kernel/cgroup/cgroup.c

   1 /*
   2  *  Generic process-grouping system.
   3  *
   4  *  Based originally on the cpuset system, extracted by Paul Menage
   5  *  Copyright (C) 2006 Google, Inc
   6  *
   7  *  Notifications support
   8  *  Copyright (C) 2009 Nokia Corporation
   9  *  Author: Kirill A. Shutemov
  10  *
  11  *  Copyright notices from the original cpuset code:
  12  *  --------------------------------------------------
  13  *  Copyright (C) 2003 BULL SA.
  14  *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
  15  *
  16  *  Portions derived from Patrick Mochel's sysfs code.
  17  *  sysfs is Copyright (c) 2001-3 Patrick Mochel
  18  *
  19  *  2003-10-10 Written by Simon Derr.
  20  *  2003-10-22 Updates by Stephen Hemminger.
  21  *  2004 May-July Rework by Paul Jackson.
  22  *  ---------------------------------------------------
  23  *
  24  *  This file is subject to the terms and conditions of the GNU General Public
  25  *  License.  See the file COPYING in the main directory of the Linux
  26  *  distribution for more details.
  27  */
  28
  29 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  30
  31 #include "cgroup-internal.h"
  32
  33 #include <linux/bpf-cgroup.h>
  34 #include <linux/cred.h>
  35 #include <linux/errno.h>
  36 #include <linux/init_task.h>
  37 #include <linux/kernel.h>
  38 #include <linux/magic.h>
  39 #include <linux/mutex.h>
  40 #include <linux/mount.h>
  41 #include <linux/pagemap.h>
  42 #include <linux/proc_fs.h>
  43 #include <linux/rcupdate.h>
  44 #include <linux/sched.h>
  45 #include <linux/sched/task.h>
  46 #include <linux/slab.h>
  47 #include <linux/spinlock.h>
  48 #include <linux/percpu-rwsem.h>
  49 #include <linux/string.h>
  50 #include <linux/hashtable.h>
  51 #include <linux/idr.h>
  52 #include <linux/kthread.h>
  53 #include <linux/atomic.h>
  54 #include <linux/cpuset.h>
  55 #include <linux/proc_ns.h>
  56 #include <linux/nsproxy.h>
  57 #include <linux/file.h>
  58 #include <linux/fs_parser.h>
  59 #include <linux/sched/cputime.h>
  60 #include <linux/sched/deadline.h>
  61 #include <linux/psi.h>
  62 #include <net/sock.h>
  63
  64 #define CREATE_TRACE_POINTS
  65 #include <trace/events/cgroup.h>
  66
     /*
      * Upper bound on the length of a cgroup interface file name: the longest
      * type name plus the longest cftype name plus two extra bytes.
      * NOTE(review): the two extra bytes are presumably a separator and the
      * terminating NUL - confirm against the code that formats file names.
      */
  67 #define CGROUP_FILE_NAME_MAX            (MAX_CGROUP_TYPE_NAMELEN +      \
  68                                          MAX_CFTYPE_NAME + 2)
  69 /*
      * Minimum interval, in jiffies, between two notifications: let's not
      * notify more than 100 times per second.
      */
  70 #define CGROUP_FILE_NOTIFY_MIN_INTV     DIV_ROUND_UP(HZ, 100)
  71
  72 /*
  73  * To avoid confusing the compiler (and generating warnings) with code
  74  * that attempts to access what would be a 0-element array (i.e. sized
  75  * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this
  76  * constant expression can be added.
      *
      * It is a compile-time constant: true iff at least one subsystem is
      * configured.  Code in this file tests it before indexing per-subsystem
      * arrays such as cgroup_subsys_enabled_key[] or cgrp->subsys[].
  77  */
  78 #define CGROUP_HAS_SUBSYS_CONFIG        (CGROUP_SUBSYS_COUNT > 0)
  79
  80 /*
  81  * cgroup_mutex is the master lock.  Any modification to cgroup or its
  82  * hierarchy must be performed while holding it.
  83  *
  84  * css_set_lock protects task->cgroups pointer, the list of css_set
  85  * objects, and the chain of tasks off each css_set.
      * In this file it is taken with spin_lock_irq() (see
      * cgroup_task_count()).
  86  *
  87  * These locks are exported if CONFIG_PROVE_RCU so that accessors in
  88  * cgroup.h can use them for lockdep annotations.
  89  */
  90 DEFINE_MUTEX(cgroup_mutex);
  91 DEFINE_SPINLOCK(css_set_lock);
  92
  93 #ifdef CONFIG_PROVE_RCU
  94 EXPORT_SYMBOL_GPL(cgroup_mutex);
  95 EXPORT_SYMBOL_GPL(css_set_lock);
  96 #endif /* CONFIG_PROVE_RCU */
  97
     /*
      * Global buffer of TRACE_CGROUP_PATH_LEN bytes and a spinlock defined
      * alongside it.  NOTE(review): presumably the lock serializes users of
      * the buffer when cgroup paths are formatted for tracepoints - confirm
      * against the TRACE_CGROUP_PATH() definition.
      */
  98 DEFINE_SPINLOCK(trace_cgroup_path_lock);
  99 char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
     /* NOTE(review): not referenced in this part of the file; set elsewhere */
 100 static bool cgroup_debug __read_mostly;
 101
 102 /*
 103  * Protects cgroup_idr and css_idr so that IDs can be released without
 104  * grabbing cgroup_mutex.
      * Taken with spin_lock_bh() by the cgroup_idr_*() wrappers
      * in this file.
 105  */
 106 static DEFINE_SPINLOCK(cgroup_idr_lock);
 107
 108 /*
 109  * Protects cgroup_file->kn for !self csses.  It synchronizes notifications
 110  * against file removal/re-creation across css hiding.
 111  */
 112 static DEFINE_SPINLOCK(cgroup_file_kn_lock);
 113
     /*
      * NOTE(review): not used in this part of the file; judging by the name it
      * guards thread group membership against concurrent changes - confirm.
      */
 114 DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
 115
     /*
      * Lockdep assertion: warn unless the caller holds cgroup_mutex or is
      * inside an RCU read-side critical section.
      *
      * NOTE(review): the expansion already ends in ';', so the usual call form
      * "cgroup_assert_mutex_or_rcu_locked();" leaves an extra empty statement
      * and the macro cannot be the unbraced body of an if that has an else.
      */
 116 #define cgroup_assert_mutex_or_rcu_locked()                             \
 117         RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
 118                            !lockdep_is_held(&cgroup_mutex),             \
 119                            "cgroup_mutex or RCU read lock required");
 120
 121 /*
 122  * cgroup destruction makes heavy use of work items and there can be a lot
 123  * of concurrent destructions.  Use a separate workqueue so that cgroup
 124  * destruction work items don't end up filling up max_active of system_wq
 125  * which may lead to deadlock.
      *
      * Only the pointer is defined here (statically zero-initialized); the
      * workqueue itself is allocated outside this part of the file.
 126  */
 127 static struct workqueue_struct *cgroup_destroy_wq;
 128
     /*
      * The tables below are generated by including <linux/cgroup_subsys.h>
      * several times, each time with a different definition of SUBSYS().
      * Every array is indexed by the <name>_cgrp_id subsystem ID.
      */

 129 /* generate an array of cgroup subsystem pointers */
 130 #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
 131 struct cgroup_subsys *cgroup_subsys[] = {
 132 #include <linux/cgroup_subsys.h>
 133 };
 134 #undef SUBSYS
 135
 136 /* array of cgroup subsystem names */
 137 #define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
 138 static const char *cgroup_subsys_name[] = {
 139 #include <linux/cgroup_subsys.h>
 140 };
 141 #undef SUBSYS
 142
 143 /*
      * Define and export, per subsystem, the two static keys behind
      * cgroup_subsys_enabled() and cgroup_subsys_on_dfl().  Both keys are
      * defined with DEFINE_STATIC_KEY_TRUE(), i.e. they start out true.
      */
 144 #define SUBSYS(_x)                                                              \
 145         DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key);                 \
 146         DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key);                  \
 147         EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key);                      \
 148         EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
 149 #include <linux/cgroup_subsys.h>
 150 #undef SUBSYS
 151
     /* array of pointers to the "enabled" keys, see cgroup_ssid_enabled() */
 152 #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
 153 static struct static_key_true *cgroup_subsys_enabled_key[] = {
 154 #include <linux/cgroup_subsys.h>
 155 };
 156 #undef SUBSYS
 157
     /* array of pointers to the "on default hierarchy" keys */
 158 #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
 159 static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
 160 #include <linux/cgroup_subsys.h>
 161 };
 162 #undef SUBSYS
 163
     /* statically allocated per-cpu rstat storage for the default root cgroup */
 164 static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);
 165
 166 /* the default hierarchy */
 167 struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
 168 EXPORT_SYMBOL_GPL(cgrp_dfl_root);
 169
 170 /*
 171  * The default hierarchy always exists but is hidden until mounted for the
 172  * first time.  This is for backward compatibility.
 173  */
 174 static bool cgrp_dfl_visible;
 175
     /*
      * The three masks below are subsystem bitmasks (bit number == subsystem
      * ID).  cgroup_control() hides both the inhibited and the implicit
      * controllers on the root of the default hierarchy.
      */

 176 /* some controllers are not supported in the default hierarchy */
 177 static u16 cgrp_dfl_inhibit_ss_mask;
 178
 179 /* some controllers are implicitly enabled on the default hierarchy */
 180 static u16 cgrp_dfl_implicit_ss_mask;
 181
 182 /* some controllers can be threaded on the default hierarchy */
 183 static u16 cgrp_dfl_threaded_ss_mask;
 184
 185 /* The list of hierarchy roots */
 186 LIST_HEAD(cgroup_roots);
     /* NOTE(review): presumably the number of entries on cgroup_roots - confirm */
 187 static int cgroup_root_count;
 188
 189 /* hierarchy ID allocation and mapping, protected by cgroup_mutex */
 190 static DEFINE_IDR(cgroup_hierarchy_idr);
 191
 192 /*
 193  * Assign a monotonically increasing serial number to csses.  It guarantees
 194  * cgroups with bigger numbers are newer than those with smaller numbers.
 195  * Also, as csses are always appended to the parent's ->children list, it
 196  * guarantees that sibling csses are always sorted in the ascending serial
 197  * number order on the list.  Protected by cgroup_mutex.
      *
      * This is the next number to hand out; numbering starts at 1.
 198  */
 199 static u64 css_serial_nr_next = 1;
 200
 201 /*
 202  * These bitmasks identify subsystems with specific features to avoid
 203  * having to do iterative checks repeatedly.
      *
      * NOTE(review): presumably one bit per subsystem ID, set for subsystems
      * providing the named callback; the code filling them in is not in this
      * part of the file.
 204  */
 205 static u16 have_fork_callback __read_mostly;
 206 static u16 have_exit_callback __read_mostly;
 207 static u16 have_release_callback __read_mostly;
 208 static u16 have_canfork_callback __read_mostly;
 209
 210 /*
      * cgroup namespace for init task
      *
      * Statically initialized: owned by init_user_ns and rooted at
      * init_css_set.  NOTE(review): the reason the reference count starts at
      * 2 rather than 1 is not visible in this part of the file.
      */
 211 struct cgroup_namespace init_cgroup_ns = {
 212         .ns.count       = REFCOUNT_INIT(2),
 213         .user_ns        = &init_user_ns,
 214         .ns.ops         = &cgroupns_operations,
 215         .ns.inum        = PROC_CGROUP_INIT_INO,
 216         .root_cset      = &init_css_set,
 217 };
 218
     /* forward declarations of objects defined outside this part of the file */
 219 static struct file_system_type cgroup2_fs_type;
 220 static struct cftype cgroup_base_files[];
 221 static struct cftype cgroup_psi_files[];
 222
 223 /*
      * cgroup optional features
      *
      * OPT_FEATURE_COUNT is always the last enumerator, so it equals the
      * number of features compiled in (0 when CONFIG_PSI is off).
      */
 224 enum cgroup_opt_features {
 225 #ifdef CONFIG_PSI
 226         OPT_FEATURE_PRESSURE,
 227 #endif
 228         OPT_FEATURE_COUNT
 229 };
 230
     /* feature names, listed in the same order as enum cgroup_opt_features */
 231 static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = {
 232 #ifdef CONFIG_PSI
 233         "pressure",
 234 #endif
 235 };
 236
     /*
      * NOTE(review): presumably one bit per enum cgroup_opt_features value, set
      * when that feature is disabled - confirm where the mask is written.
      */
 237 static u16 cgroup_feature_disable_mask __read_mostly;
 238
     /* forward declarations of static functions defined outside this part of the file */
 239 static int cgroup_apply_control(struct cgroup *cgrp);
 240 static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
 241 static void css_task_iter_skip(struct css_task_iter *it,
 242                                struct task_struct *task);
 243 static int cgroup_destroy_locked(struct cgroup *cgrp);
 244 static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
 245                                               struct cgroup_subsys *ss);
 246 static void css_release(struct percpu_ref *ref);
 247 static void kill_css(struct cgroup_subsys_state *css);
 248 static int cgroup_addrm_files(struct cgroup_subsys_state *css,
 249                               struct cgroup *cgrp, struct cftype cfts[],
 250                               bool is_add);
 251
     /*
      * With CONFIG_DEBUG_CGROUP_REF, include <linux/cgroup_refcnt.h> here with
      * the function attribute set to noinline and every function exported.
      * NOTE(review): the header presumably holds the cgroup reference-count
      * helper bodies, built out of line for debugging - confirm.
      */
 252 #ifdef CONFIG_DEBUG_CGROUP_REF
 253 #define CGROUP_REF_FN_ATTRS     noinline
 254 #define CGROUP_REF_EXPORT(fn)   EXPORT_SYMBOL_GPL(fn);
 255 #include <linux/cgroup_refcnt.h>
 256 #endif
 257
 258 /**
 259  * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
 260  * @ssid: subsys ID of interest
 261  *
 262  * cgroup_subsys_enabled() can only be used with literal subsys names which
 263  * is fine for individual subsystems but unsuitable for cgroup core.  This
 264  * is slower static_key_enabled() based test indexed by @ssid.
 265  */
 266 bool cgroup_ssid_enabled(int ssid)
 267 {
 268         if (!CGROUP_HAS_SUBSYS_CONFIG)
 269                 return false;
 270
 271         return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
 272 }
 273
 274 /**
 275  * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
 276  * @cgrp: the cgroup of interest
 277  *
 278  * The default hierarchy is the v2 interface of cgroup and this function
 279  * can be used to test whether a cgroup is on the default hierarchy for
 280  * cases where a subsystem should behave differently depending on the
 281  * interface version.
 282  *
 283  * List of changed behaviors:
 284  *
 285  * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
 286  *   and "name" are disallowed.
 287  *
 288  * - When mounting an existing superblock, mount options should match.
 289  *
 290  * - rename(2) is disallowed.
 291  *
 292  * - "tasks" is removed.  Everything should be at process granularity.  Use
 293  *   "cgroup.procs" instead.
 294  *
 295  * - "cgroup.procs" is not sorted.  pids will be unique unless they got
 296  *   recycled in-between reads.
 297  *
 298  * - "release_agent" and "notify_on_release" are removed.  Replacement
 299  *   notification mechanism will be implemented.
 300  *
 301  * - "cgroup.clone_children" is removed.
 302  *
 303  * - "cgroup.subtree_populated" is available.  Its value is 0 if the cgroup
 304  *   and its descendants contain no task; otherwise, 1.  The file also
 305  *   generates kernfs notification which can be monitored through poll and
 306  *   [di]notify when the value of the file changes.
 307  *
 308  * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
 309  *   take masks of ancestors with non-empty cpus/mems, instead of being
 310  *   moved to an ancestor.
 311  *
 312  * - cpuset: a task can be moved into an empty cpuset, and again it takes
 313  *   masks of ancestors.
 314  *
 315  * - blkcg: blk-throttle becomes properly hierarchical.
 316  */
 317 bool cgroup_on_dfl(const struct cgroup *cgrp)
 318 {
 319         return cgrp->root == &cgrp_dfl_root;
 320 }
 321
 322 /* IDR wrappers which synchronize using cgroup_idr_lock */
 323 static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
 324                             gfp_t gfp_mask)
 325 {
 326         int ret;
 327
 328         idr_preload(gfp_mask);
 329         spin_lock_bh(&cgroup_idr_lock);
 330         ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
 331         spin_unlock_bh(&cgroup_idr_lock);
 332         idr_preload_end();
 333         return ret;
 334 }
 335
 336 static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
 337 {
 338         void *ret;
 339
 340         spin_lock_bh(&cgroup_idr_lock);
 341         ret = idr_replace(idr, ptr, id);
 342         spin_unlock_bh(&cgroup_idr_lock);
 343         return ret;
 344 }
 345
 346 static void cgroup_idr_remove(struct idr *idr, int id)
 347 {
 348         spin_lock_bh(&cgroup_idr_lock);
 349         idr_remove(idr, id);
 350         spin_unlock_bh(&cgroup_idr_lock);
 351 }
 352
 353 static bool cgroup_has_tasks(struct cgroup *cgrp)
 354 {
 355         return cgrp->nr_populated_csets;
 356 }
 357
 358 static bool cgroup_is_threaded(struct cgroup *cgrp)
 359 {
 360         return cgrp->dom_cgrp != cgrp;
 361 }
 362
 363 /* can @cgrp host both domain and threaded children? */
 364 static bool cgroup_is_mixable(struct cgroup *cgrp)
 365 {
 366         /*
 367          * Root isn't under domain level resource control exempting it from
 368          * the no-internal-process constraint, so it can serve as a thread
 369          * root and a parent of resource domains at the same time.
 370          */
 371         return !cgroup_parent(cgrp);
 372 }
 373
 374 /* can @cgrp become a thread root? Should always be true for a thread root */
 375 static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
 376 {
 377         /* mixables don't care */
 378         if (cgroup_is_mixable(cgrp))
 379                 return true;
 380
 381         /* domain roots can't be nested under threaded */
 382         if (cgroup_is_threaded(cgrp))
 383                 return false;
 384
 385         /* can only have either domain or threaded children */
 386         if (cgrp->nr_populated_domain_children)
 387                 return false;
 388
 389         /* and no domain controllers can be enabled */
 390         if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
 391                 return false;
 392
 393         return true;
 394 }
 395
 396 /* is @cgrp root of a threaded subtree? */
 397 static bool cgroup_is_thread_root(struct cgroup *cgrp)
 398 {
 399         /* thread root should be a domain */
 400         if (cgroup_is_threaded(cgrp))
 401                 return false;
 402
 403         /* a domain w/ threaded children is a thread root */
 404         if (cgrp->nr_threaded_children)
 405                 return true;
 406
 407         /*
 408          * A domain which has tasks and explicit threaded controllers
 409          * enabled is a thread root.
 410          */
 411         if (cgroup_has_tasks(cgrp) &&
 412             (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))
 413                 return true;
 414
 415         return false;
 416 }
 417
 418 /* a domain which isn't connected to the root w/o brekage can't be used */
 419 static bool cgroup_is_valid_domain(struct cgroup *cgrp)
 420 {
 421         /* the cgroup itself can be a thread root */
 422         if (cgroup_is_threaded(cgrp))
 423                 return false;
 424
 425         /* but the ancestors can't be unless mixable */
 426         while ((cgrp = cgroup_parent(cgrp))) {
 427                 if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp))
 428                         return false;
 429                 if (cgroup_is_threaded(cgrp))
 430                         return false;
 431         }
 432
 433         return true;
 434 }
 435
 436 /* subsystems visibly enabled on a cgroup */
 437 static u16 cgroup_control(struct cgroup *cgrp)
 438 {
 439         struct cgroup *parent = cgroup_parent(cgrp);
 440         u16 root_ss_mask = cgrp->root->subsys_mask;
 441
 442         if (parent) {
 443                 u16 ss_mask = parent->subtree_control;
 444
 445                 /* threaded cgroups can only have threaded controllers */
 446                 if (cgroup_is_threaded(cgrp))
 447                         ss_mask &= cgrp_dfl_threaded_ss_mask;
 448                 return ss_mask;
 449         }
 450
 451         if (cgroup_on_dfl(cgrp))
 452                 root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
 453                                   cgrp_dfl_implicit_ss_mask);
 454         return root_ss_mask;
 455 }
 456
 457 /* subsystems enabled on a cgroup */
 458 static u16 cgroup_ss_mask(struct cgroup *cgrp)
 459 {
 460         struct cgroup *parent = cgroup_parent(cgrp);
 461
 462         if (parent) {
 463                 u16 ss_mask = parent->subtree_ss_mask;
 464
 465                 /* threaded cgroups can only have threaded controllers */
 466                 if (cgroup_is_threaded(cgrp))
 467                         ss_mask &= cgrp_dfl_threaded_ss_mask;
 468                 return ss_mask;
 469         }
 470
 471         return cgrp->root->subsys_mask;
 472 }
 473
 474 /**
 475  * cgroup_css - obtain a cgroup's css for the specified subsystem
 476  * @cgrp: the cgroup of interest
 477  * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 478  *
 479  * Return @cgrp's css (cgroup_subsys_state) associated with @ss.  This
 480  * function must be called either under cgroup_mutex or rcu_read_lock() and
 481  * the caller is responsible for pinning the returned css if it wants to
 482  * keep accessing it outside the said locks.  This function may return
 483  * %NULL if @cgrp doesn't have @subsys_id enabled.
 484  */
 485 static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
 486                                               struct cgroup_subsys *ss)
 487 {
 488         if (CGROUP_HAS_SUBSYS_CONFIG && ss)
 489                 return rcu_dereference_check(cgrp->subsys[ss->id],
 490                                         lockdep_is_held(&cgroup_mutex));
 491         else
 492                 return &cgrp->self;
 493 }
 494
 495 /**
 496  * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
 497  * @cgrp: the cgroup of interest
 498  * @ss: the subsystem of interest
 499  *
 500  * Find and get @cgrp's css associated with @ss.  If the css doesn't exist
 501  * or is offline, %NULL is returned.
 502  */
 503 static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
 504                                                      struct cgroup_subsys *ss)
 505 {
 506         struct cgroup_subsys_state *css;
 507
 508         rcu_read_lock();
 509         css = cgroup_css(cgrp, ss);
 510         if (css && !css_tryget_online(css))
 511                 css = NULL;
 512         rcu_read_unlock();
 513
 514         return css;
 515 }
 516
 517 /**
 518  * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
 519  * @cgrp: the cgroup of interest
 520  * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 521  *
 522  * Similar to cgroup_css() but returns the effective css, which is defined
 523  * as the matching css of the nearest ancestor including self which has @ss
 524  * enabled.  If @ss is associated with the hierarchy @cgrp is on, this
 525  * function is guaranteed to return non-NULL css.
 526  */
 527 static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
 528                                                         struct cgroup_subsys *ss)
 529 {
 530         lockdep_assert_held(&cgroup_mutex);
 531
 532         if (!ss)
 533                 return &cgrp->self;
 534
 535         /*
 536          * This function is used while updating css associations and thus
 537          * can't test the csses directly.  Test ss_mask.
 538          */
 539         while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
 540                 cgrp = cgroup_parent(cgrp);
 541                 if (!cgrp)
 542                         return NULL;
 543         }
 544
 545         return cgroup_css(cgrp, ss);
 546 }
 547
 548 /**
 549  * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
 550  * @cgrp: the cgroup of interest
 551  * @ss: the subsystem of interest
 552  *
 553  * Find and get the effective css of @cgrp for @ss.  The effective css is
 554  * defined as the matching css of the nearest ancestor including self which
 555  * has @ss enabled.  If @ss is not mounted on the hierarchy @cgrp is on,
 556  * the root css is returned, so this function always returns a valid css.
 557  *
 558  * The returned css is not guaranteed to be online, and therefore it is the
 559  * callers responsibility to try get a reference for it.
 560  */
 561 struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
 562                                          struct cgroup_subsys *ss)
 563 {
 564         struct cgroup_subsys_state *css;
 565
 566         if (!CGROUP_HAS_SUBSYS_CONFIG)
 567                 return NULL;
 568
 569         do {
 570                 css = cgroup_css(cgrp, ss);
 571
 572                 if (css)
 573                         return css;
 574                 cgrp = cgroup_parent(cgrp);
 575         } while (cgrp);
 576
 577         return init_css_set.subsys[ss->id];
 578 }
 579
 580 /**
 581  * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
 582  * @cgrp: the cgroup of interest
 583  * @ss: the subsystem of interest
 584  *
 585  * Find and get the effective css of @cgrp for @ss.  The effective css is
 586  * defined as the matching css of the nearest ancestor including self which
 587  * has @ss enabled.  If @ss is not mounted on the hierarchy @cgrp is on,
 588  * the root css is returned, so this function always returns a valid css.
 589  * The returned css must be put using css_put().
 590  */
 591 struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
 592                                              struct cgroup_subsys *ss)
 593 {
 594         struct cgroup_subsys_state *css;
 595
 596         if (!CGROUP_HAS_SUBSYS_CONFIG)
 597                 return NULL;
 598
 599         rcu_read_lock();
 600
 601         do {
 602                 css = cgroup_css(cgrp, ss);
 603
 604                 if (css && css_tryget_online(css))
 605                         goto out_unlock;
 606                 cgrp = cgroup_parent(cgrp);
 607         } while (cgrp);
 608
 609         css = init_css_set.subsys[ss->id];
 610         css_get(css);
 611 out_unlock:
 612         rcu_read_unlock();
 613         return css;
 614 }
 615 EXPORT_SYMBOL_GPL(cgroup_get_e_css);
 616
 617 static void cgroup_get_live(struct cgroup *cgrp)
 618 {
 619         WARN_ON_ONCE(cgroup_is_dead(cgrp));
 620         cgroup_get(cgrp);
 621 }
 622
 623 /**
 624  * __cgroup_task_count - count the number of tasks in a cgroup. The caller
 625  * is responsible for taking the css_set_lock.
 626  * @cgrp: the cgroup in question
 627  */
 628 int __cgroup_task_count(const struct cgroup *cgrp)
 629 {
 630         int count = 0;
 631         struct cgrp_cset_link *link;
 632
 633         lockdep_assert_held(&css_set_lock);
 634
 635         list_for_each_entry(link, &cgrp->cset_links, cset_link)
 636                 count += link->cset->nr_tasks;
 637
 638         return count;
 639 }
 640
 641 /**
 642  * cgroup_task_count - count the number of tasks in a cgroup.
 643  * @cgrp: the cgroup in question
 644  */
 645 int cgroup_task_count(const struct cgroup *cgrp)
 646 {
 647         int count;
 648
 649         spin_lock_irq(&css_set_lock);
 650         count = __cgroup_task_count(cgrp);
 651         spin_unlock_irq(&css_set_lock);
 652
 653         return count;
 654 }
 655
 656 struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
 657 {
 658         struct cgroup *cgrp = of->kn->parent->priv;
 659         struct cftype *cft = of_cft(of);
 660
 661         /*
 662          * This is open and unprotected implementation of cgroup_css().
 663          * seq_css() is only called from a kernfs file operation which has
 664          * an active reference on the file.  Because all the subsystem
 665          * files are drained before a css is disassociated with a cgroup,
 666          * the matching css from the cgroup's subsys table is guaranteed to
 667          * be and stay valid until the enclosing operation is complete.
 668          */
 669         if (CGROUP_HAS_SUBSYS_CONFIG && cft->ss)
 670                 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
 671         else
 672                 return &cgrp->self;
 673 }
 674 EXPORT_SYMBOL_GPL(of_css);
 675
 676 /**
 677  * for_each_css - iterate all css's of a cgroup
 678  * @css: the iteration cursor
 679  * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 680  * @cgrp: the target cgroup to iterate css's of
 681  *
 682  * Should be called under cgroup_mutex, which is what the lockdep
      * condition passed to rcu_dereference_check() tests for.
      *
      * Subsystems for which @cgrp has no css (NULL slot) are skipped.  The
      * statement following the macro becomes the body of the trailing "else",
      * so it only runs with a non-NULL @css.
 683  */
 684 #define for_each_css(css, ssid, cgrp)                                   \
 685         for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)        \
 686                 if (!((css) = rcu_dereference_check(                    \
 687                                 (cgrp)->subsys[(ssid)],                 \
 688                                 lockdep_is_held(&cgroup_mutex)))) { }   \
 689                 else
 690
 691 /**
 692  * do_each_subsys_mask - filter for_each_subsys with a bitmask
 693  * @ss: the iteration cursor
 694  * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 695  * @ss_mask: the bitmask
 696  *
 697  * The block will only run for cases where the ssid-th bit (1 << ssid) of
 698  * @ss_mask is set.
      *
      * @ss_mask is evaluated once and copied into a local unsigned long.  When
      * no subsystem is configured, @ssid is set to 0 and the block is skipped
      * entirely.
      *
      * This macro opens a do { ... } while (false) statement and two brace
      * levels which it does not close; every use must be terminated with
      * while_each_subsys_mask().
 699  */
 700 #define do_each_subsys_mask(ss, ssid, ss_mask) do {                     \
 701         unsigned long __ss_mask = (ss_mask);                            \
 702         if (!CGROUP_HAS_SUBSYS_CONFIG) {                                \
 703                 (ssid) = 0;                                             \
 704                 break;                                                  \
 705         }                                                               \
 706         for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) {       \
 707                 (ss) = cgroup_subsys[ssid];                             \
 708                 {
 709
     /*
      * while_each_subsys_mask - close a do_each_subsys_mask() block
      *
      * Supplies the closing braces and the "while (false)" matching
      * do_each_subsys_mask(); the caller adds the final semicolon.
      */
 710 #define while_each_subsys_mask()                                        \
 711                 }                                                       \
 712         }                                                               \
 713 } while (false)
 714
 715 /*
      * iterate over child cgrps, lock should be held throughout iteration
      *
      * Walks @cgrp's self.children list and skips children for which
      * cgroup_is_dead() is true.  cgroup_mutex is asserted via lockdep on every
      * step.  The statement following the macro is the body of the trailing
      * "else", so it runs for live children only.
      */
 716 #define cgroup_for_each_live_child(child, cgrp)                         \
 717         list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
 718                 if (({ lockdep_assert_held(&cgroup_mutex);              \
 719                        cgroup_is_dead(child); }))                       \
 720                         ;                                               \
 721                 else
 722
 723 /*
      * walk live descendants in pre order
      *
      * @d_css is the css cursor of css_for_each_descendant_pre() started at
      * @cgrp's self css; @dsct is set to the cgroup of each visited css.
      * Cgroups for which cgroup_is_dead() is true are skipped (the walk itself
      * still continues through them).  cgroup_mutex is asserted via lockdep on
      * every step.
      */
 724 #define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)          \
 725         css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL))  \
 726                 if (({ lockdep_assert_held(&cgroup_mutex);              \
 727                        (dsct) = (d_css)->cgroup;                        \
 728                        cgroup_is_dead(dsct); }))                        \
 729                         ;                                               \
 730                 else
 731
 732 /*
      * walk live descendants in postorder
      *
      * Same contract as the pre-order variant, but built on
      * css_for_each_descendant_post(): @dsct is set to the cgroup of each
      * visited css, dead cgroups are skipped, and cgroup_mutex is asserted via
      * lockdep on every step.
      */
 733 #define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp)         \
 734         css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \
 735                 if (({ lockdep_assert_held(&cgroup_mutex);              \
 736                        (dsct) = (d_css)->cgroup;                        \
 737                        cgroup_is_dead(dsct); }))                        \
 738                         ;                                               \
 739                 else
 740
 741 /*
 742  * The default css_set - used by init and its children prior to any
 743  * hierarchies being mounted. It contains a pointer to the root state
 744  * for each subsystem. Also used to anchor the list of css_sets. Not
 745  * reference-counted, to improve performance when child cgroups
 746  * haven't been created.
      *
      * All list heads are initialized empty and dom_cset points back at the
      * set itself, so css_set_threaded() is false for it.
 747  */
 748 struct css_set init_css_set = {
 749         .refcount               = REFCOUNT_INIT(1),
 750         .dom_cset               = &init_css_set,
 751         .tasks                  = LIST_HEAD_INIT(init_css_set.tasks),
 752         .mg_tasks               = LIST_HEAD_INIT(init_css_set.mg_tasks),
 753         .dying_tasks            = LIST_HEAD_INIT(init_css_set.dying_tasks),
 754         .task_iters             = LIST_HEAD_INIT(init_css_set.task_iters),
 755         .threaded_csets         = LIST_HEAD_INIT(init_css_set.threaded_csets),
 756         .cgrp_links             = LIST_HEAD_INIT(init_css_set.cgrp_links),
 757         .mg_src_preload_node    = LIST_HEAD_INIT(init_css_set.mg_src_preload_node),
 758         .mg_dst_preload_node    = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node),
 759         .mg_node                = LIST_HEAD_INIT(init_css_set.mg_node),
 760
 761         /*
 762          * The following field is re-initialized when this cset gets linked
 763          * in cgroup_init().  However, let's initialize the field
 764          * statically too so that the default cgroup can be accessed safely
 765          * early during boot.
 766          */
 767         .dfl_cgrp               = &cgrp_dfl_root.cgrp,
 768 };
 769
     /* number of css_sets in existence; starts at 1 to account for init_css_set */
 770 static int css_set_count        = 1;    /* 1 for init_css_set */
 771
 772 static bool css_set_threaded(struct css_set *cset)
 773 {
 774         return cset->dom_cset != cset;
 775 }
 776
 777 /**
 778  * css_set_populated - does a css_set contain any tasks?
 779  * @cset: target css_set
 780  *
 781  * css_set_populated() should be the same as !!cset->nr_tasks at steady
 782  * state. However, css_set_populated() can be called while a task is being
 783  * added to or removed from the linked list before the nr_tasks is
 784  * properly updated. Hence, we can't just look at ->nr_tasks here.
 785  */
 786 static bool css_set_populated(struct css_set *cset)
 787 {
 788         lockdep_assert_held(&css_set_lock);
 789
 790         return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
 791 }
 792
 793 /**
 794  * cgroup_update_populated - adjust a cgroup's populated counters
 795  * @cgrp: cgroup whose css_set just changed populated state
 796  * @populated: %true to increment the counters, %false to decrement
 797  *
 798  * Called when a css_set linked to @cgrp receives its first task or loses
 799  * its last one.  @cgrp->nr_populated_csets is adjusted and, for as long as
 800  * a cgroup's populated state flips, the change is carried up to the parent
 801  * through its nr_populated_{domain,threaded}_children counters, so that a
 802  * cgroup's child counters are zero iff no descendant contains any task.
 803  *
 804  * cgroup_is_populated() is sampled before and after the adjustment.  When
 805  * it flips, cgroup1_check_for_release() is invoked, the notify_populated
 806  * trace event is emitted and userland is notified via @cgrp->events_file,
 807  * which lets it detect when @cgrp and its descendants become populated
 808  * or empty.  Must be called with css_set_lock held.
 809  */
 810 static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
 811 {
 812         const int delta = populated ? 1 : -1;
 813         struct cgroup *from = NULL;     /* level the change came up from */
 814
 815         lockdep_assert_held(&css_set_lock);
 816
 817         do {
 818                 bool before = cgroup_is_populated(cgrp);
 819
 820                 /* first pass counts the cset, later ones count the child */
 821                 if (!from)
 822                         cgrp->nr_populated_csets += delta;
 823                 else if (cgroup_is_threaded(from))
 824                         cgrp->nr_populated_threaded_children += delta;
 825                 else
 826                         cgrp->nr_populated_domain_children += delta;
 827
 828                 /* state unchanged at this level, ancestors are unaffected */
 829                 if (cgroup_is_populated(cgrp) == before)
 830                         break;
 831
 832                 cgroup1_check_for_release(cgrp);
 833                 TRACE_CGROUP_PATH(notify_populated, cgrp,
 834                                   cgroup_is_populated(cgrp));
 835                 cgroup_file_notify(&cgrp->events_file);
 836
 837                 from = cgrp;
 838                 cgrp = cgroup_parent(cgrp);
 839         } while (cgrp);
 840 }
 841
 842 /**
 843  * css_set_update_populated - propagate a css_set's populated transition
 844  * @cset: css_set that just gained its first task or lost its last one
 845  * @populated: %true if @cset became populated, %false if it became empty
 846  *
 847  * Walks @cset->cgrp_links and feeds the transition to every cgroup @cset
 848  * is linked to through cgroup_update_populated().
 849  */
 850 static void css_set_update_populated(struct css_set *cset, bool populated)
 851 {
 852         struct cgrp_cset_link *lnk;
 853
 854         lockdep_assert_held(&css_set_lock);
 855
 856         list_for_each_entry(lnk, &cset->cgrp_links, cgrp_link)
 857                 cgroup_update_populated(lnk->cgrp, populated);
 858 }
 859
 860 /*
 861  * @task is about to be unlinked from @cset.  Every task iterator on
 862  * @cset->task_iters gets a chance to step past @task so that it can resume
 863  * at the next position.  Skipping may drop the iterator from the list,
 864  * hence the _safe list walk.  See css_task_iter_skip() for details.
 865  */
 866 static void css_set_skip_task_iters(struct css_set *cset,
 867                                     struct task_struct *task)
 868 {
 869         struct css_task_iter *iter, *next;
 870
 871         list_for_each_entry_safe(iter, next, &cset->task_iters, iters_node)
 872                 css_task_iter_skip(iter, task);
 873 }
 874
 875 /**
 876  * css_set_move_task - relink a task from one css_set to another
 877  * @task: the task to move
 878  * @from_cset: css_set @task is currently linked to (may be NULL)
 879  * @to_cset: css_set @task should end up on (may be NULL)
 880  * @use_mg_tasks: link onto @to_cset->mg_tasks rather than ->tasks
 881  *
 882  * Unlinks @task from @from_cset and links it to @to_cset.  Pass a NULL
 883  * @from_cset for a task which isn't on any css_set yet and a NULL
 884  * @to_cset to only disassociate @task.
 885  *
 886  * Populated counters and css_task_iters are kept up to date from here;
 887  * reference counts of @from_cset and @to_cset are left entirely to the
 888  * caller.  Must be called with css_set_lock held.
 889  */
 890 static void css_set_move_task(struct task_struct *task,
 891                               struct css_set *from_cset, struct css_set *to_cset,
 892                               bool use_mg_tasks)
 893 {
 894         lockdep_assert_held(&css_set_lock);
 895
 896         if (to_cset && !css_set_populated(to_cset))
 897                 css_set_update_populated(to_cset, true);
 898
 899         if (!from_cset) {
 900                 WARN_ON_ONCE(!list_empty(&task->cg_list));
 901         } else {
 902                 WARN_ON_ONCE(list_empty(&task->cg_list));
 903
 904                 css_set_skip_task_iters(from_cset, task);
 905                 list_del_init(&task->cg_list);
 906                 if (!css_set_populated(from_cset))
 907                         css_set_update_populated(from_cset, false);
 908         }
 909
 910         if (!to_cset)
 911                 return;
 912
 913         /*
 914          * cgroup_threadgroup_rwsem synchronizes us against PF_EXITING being
 915          * set, so cgroup_exit()/cgroup_free() can't drop the css_set under us.
 916          */
 917         WARN_ON_ONCE(task->flags & PF_EXITING);
 918
 919         cgroup_move_task(task, to_cset);
 920         list_add_tail(&task->cg_list,
 921                       use_mg_tasks ? &to_cset->mg_tasks : &to_cset->tasks);
 922 }
 923
 924 /*
 925  * Hash table of css_sets, keyed by css_set_hash() of their subsys[]
 926  * array.  It speeds up finding an existing css_set.  The hash doesn't
 927  * (currently) take into account cgroups in empty hierarchies.
 928  */
 929 #define CSS_SET_HASH_BITS       7
 930 static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
 931
 932 static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
 933 {
 934         struct cgroup_subsys *ss;
 935         unsigned long sum = 0UL;
 936         int ssid;
 937
 938         /* add up the css pointer values, then fold the upper bits down */
 939         for_each_subsys(ss, ssid)
 940                 sum += (unsigned long)css[ssid];
 941
 942         return sum ^ (sum >> 16);
 943 }
 944
 945 void put_css_set_locked(struct css_set *cset)
 946 {
 947         struct cgrp_cset_link *lnk, *next;
 948         struct cgroup_subsys *ss;
 949         int i;
 950
 951         lockdep_assert_held(&css_set_lock);
 952
 953         /* only the final put tears the css_set down */
 954         if (!refcount_dec_and_test(&cset->refcount))
 955                 return;
 956         WARN_ON_ONCE(!list_empty(&cset->threaded_csets));
 957
 958         /* @cset is dead: take it off every list and drop the refs it held */
 959         for_each_subsys(ss, i) {
 960                 list_del(&cset->e_cset_node[i]);
 961                 css_put(cset->subsys[i]);
 962         }
 963         hash_del(&cset->hlist);
 964         css_set_count--;
 965
 966         list_for_each_entry_safe(lnk, next, &cset->cgrp_links, cgrp_link) {
 967                 list_del(&lnk->cset_link);
 968                 list_del(&lnk->cgrp_link);
 969                 if (cgroup_parent(lnk->cgrp))
 970                         cgroup_put(lnk->cgrp);
 971                 kfree(lnk);
 972         }
 973
 974         if (css_set_threaded(cset)) {
 975                 list_del(&cset->threaded_csets_node);
 976                 put_css_set_locked(cset->dom_cset);
 977         }
 978
 979         kfree_rcu(cset, rcu_head);
 980 }
 981
 982 /**
 983  * compare_css_sets - match test used by find_existing_css_set()
 984  * @cset: the candidate css_set under test
 985  * @old_cset: the css_set the task currently uses
 986  * @new_cgrp: the cgroup the task is entering
 987  * @template: pre-computed css pointers the wanted css_set must carry
 988  *
 989  * Returns %true if @cset agrees with @old_cset on every hierarchy except
 990  * the one @new_cgrp lives in, where it has to point to @new_cgrp instead.
 991  */
 992 static bool compare_css_sets(struct css_set *cset,
 993                              struct css_set *old_cset,
 994                              struct cgroup *new_cgrp,
 995                              struct cgroup_subsys_state *template[])
 996 {
 997         struct list_head *cand_pos, *old_pos;
 998         struct cgroup *dfl_cgrp;
 999
1000         /*
1001          * On the default hierarchy, csets can be associated with the same
1002          * set of cgroups and still carry different csses, so make sure
1003          * that the csses match before anything else.
1004          */
1005         if (memcmp(template, cset->subsys, sizeof(cset->subsys)) != 0)
1006                 return false;
1007
1008         /* @cset's domain should match the default cgroup's */
1009         dfl_cgrp = cgroup_on_dfl(new_cgrp) ? new_cgrp : old_cset->dfl_cgrp;
1010         if (dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
1011                 return false;
1012
1013         /*
1014          * Different cgroups may share the same effective css, so the
1015          * cgroup pointers always have to be compared as well in order
1016          * to tell different cgroups in the hierarchies apart.  Walk the
1017          * cgrp_links lists of both css_sets in lockstep, starting from
1018          * the list heads.
1019          */
1020         cand_pos = &cset->cgrp_links;
1021         old_pos = &old_cset->cgrp_links;
1022         for (;;) {
1023                 struct cgrp_cset_link *cand_link, *old_link;
1024                 struct cgroup *cand_cgrp, *old_cgrp, *want;
1025
1026                 cand_pos = cand_pos->next;
1027                 old_pos = old_pos->next;
1028
1029                 /* Both lists are of equal length, so they end together. */
1030                 if (cand_pos == &cset->cgrp_links) {
1031                         BUG_ON(old_pos != &old_cset->cgrp_links);
1032                         break;
1033                 }
1034                 BUG_ON(old_pos == &old_cset->cgrp_links);
1035
1036                 /* Locate the cgroups associated with these links. */
1037                 cand_link = list_entry(cand_pos, struct cgrp_cset_link,
1038                                        cgrp_link);
1039                 old_link = list_entry(old_pos, struct cgrp_cset_link,
1040                                       cgrp_link);
1041                 cand_cgrp = cand_link->cgrp;
1042                 old_cgrp = old_link->cgrp;
1043
1044                 /* Hierarchies should be linked in the same order. */
1045                 BUG_ON(cand_cgrp->root != old_cgrp->root);
1046
1047                 /*
1048                  * If this is the hierarchy of the cgroup that's changing,
1049                  * @cset has to point to the new cgroup there.  In any
1050                  * other hierarchy, it has to point to the same cgroup as
1051                  * the old css_set does.
1052                  */
1053                 if (cand_cgrp->root == new_cgrp->root)
1054                         want = new_cgrp;
1055                 else
1056                         want = old_cgrp;
1057
1058                 if (cand_cgrp != want)
1059                         return false;
1060         }
1061
1062         return true;
1063 }
1064
1065 /**
1066  * find_existing_css_set - fill in the css template and look up a match
1067  * @old_cset: the css_set in use before the cgroup transition
1068  * @cgrp: the cgroup being moved into
1069  * @template: out param receiving the wanted csses, should be clear on entry
1070  */
1071 static struct css_set *find_existing_css_set(struct css_set *old_cset,
1072                                         struct cgroup *cgrp,
1073                                         struct cgroup_subsys_state *template[])
1074 {
1075         struct cgroup_root *root = cgrp->root;
1076         struct cgroup_subsys *ss;
1077         struct css_set *cand;
1078         unsigned long hash;
1079         int ssid;
1080
1081         /*
1082          * Collect the subsystem state objects which the new css_set is
1083          * expected to carry.  Subsystems can change globally, but the
1084          * entries looked at here don't change, so this doesn't need any
1085          * locking.
1086          */
1087         for_each_subsys(ss, ssid) {
1088                 if (!(root->subsys_mask & (1UL << ssid))) {
1089                         /*
1090                          * @ss is not part of this hierarchy, so the css
1091                          * @old_cset has stays in place.
1092                          */
1093                         template[ssid] = old_cset->subsys[ssid];
1094                 } else {
1095                         /*
1096                          * @ss is part of this hierarchy, so @cgrp's
1097                          * effective css is the one wanted.
1098                          */
1099                         template[ssid] = cgroup_e_css_by_mask(cgrp, ss);
1100                 }
1101         }
1102
1103         /* Look for a match among the css_sets hashed under the same key. */
1104         hash = css_set_hash(template);
1105         hash_for_each_possible(css_set_table, cand, hlist, hash) {
1106                 /* The first candidate that compares equal is the one. */
1107                 if (compare_css_sets(cand, old_cset, cgrp, template))
1108                         return cand;
1109         }
1110
1111         /* No existing cgroup group matched */
1112         return NULL;
1113 }
1114
1115 static void free_cgrp_cset_links(struct list_head *links_to_free)
1116 {
1117         struct cgrp_cset_link *lnk, *next;      /* entries go away mid-walk */
1118
1119         list_for_each_entry_safe(lnk, next, links_to_free, cset_link) {
1120                 list_del(&lnk->cset_link);
1121                 kfree(lnk);
1122         }
1123 }
1124
1125 /**
1126  * allocate_cgrp_cset_links - preallocate a batch of cgrp_cset_links
1127  * @count: how many links to allocate
1128  * @tmp_links: list_head which receives the allocated links
1129  *
1130  * Initializes @tmp_links and chains @count zeroed cgrp_cset_links on it
1131  * through ->cset_link.  Returns 0, or -ENOMEM with nothing left allocated.
1132  */
1133 static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
1134 {
1135         int nr;
1136
1137         INIT_LIST_HEAD(tmp_links);
1138
1139         for (nr = 0; nr < count; nr++) {
1140                 struct cgrp_cset_link *lnk = kzalloc(sizeof(*lnk), GFP_KERNEL);
1141
1142                 if (!lnk) {
1143                         free_cgrp_cset_links(tmp_links);
1144                         return -ENOMEM;
1145                 }
1146                 list_add(&lnk->cset_link, tmp_links);
1147         }
1148         return 0;
1149 }
1150
1151 /**
1152  * link_css_set - connect a css_set and a cgroup with a preallocated link
1153  * @tmp_links: links obtained from allocate_cgrp_cset_links()
1154  * @cset: the css_set to link
1155  * @cgrp: the cgroup to link @cset to
1156  */
1157 static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
1158                          struct cgroup *cgrp)
1159 {
1160         struct cgrp_cset_link *lnk;
1161
1162         BUG_ON(list_empty(tmp_links));
1163
1164         lnk = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
1165         lnk->cgrp = cgrp;
1166         lnk->cset = cset;
1167
1168         if (cgroup_on_dfl(cgrp))
1169                 cset->dfl_cgrp = cgrp;
1170
1171         /*
1172          * Links always go to the tail, which keeps both lists in
1173          * chronological order.
1174          */
1175         list_move_tail(&lnk->cset_link, &cgrp->cset_links);
1176         list_add_tail(&lnk->cgrp_link, &cset->cgrp_links);
1177
1178         if (cgroup_parent(cgrp))
1179                 cgroup_get_live(cgrp);
1180 }
1181
1182 /**
1183  * find_css_set - look up or create a css_set with one cgroup replaced
1184  * @old_cset: the baseline css_set
1185  * @cgrp: the cgroup to be updated
1186  *
1187  * Returns a referenced css_set which equals @old_cset except that @cgrp is
1188  * substituted into its hierarchy, or %NULL if an allocation failed.
1189  */
1190 static struct css_set *find_css_set(struct css_set *old_cset,
1191                                     struct cgroup *cgrp)
1192 {
1193         struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
1194         struct css_set *cset;
1195         struct list_head tmp_links;
1196         struct cgrp_cset_link *link;
1197         struct cgroup_subsys *ss;
1198         unsigned long key;
1199         int ssid;
1200
1201         lockdep_assert_held(&cgroup_mutex);
1202
1203         /* First see if we already have a css_set that matches the
1204          * desired set; if so, take a reference on it and return it */
1205         spin_lock_irq(&css_set_lock);
1206         cset = find_existing_css_set(old_cset, cgrp, template);
1207         if (cset)
1208                 get_css_set(cset);
1209         spin_unlock_irq(&css_set_lock);
1210
1211         if (cset)
1212                 return cset;
1213
1214         cset = kzalloc(sizeof(*cset), GFP_KERNEL);
1215         if (!cset)
1216                 return NULL;
1217
1218         /* Preallocate cgroup_root_count links; all get used up below */
1219         if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
1220                 kfree(cset);
1221                 return NULL;
1222         }
1223
1224         refcount_set(&cset->refcount, 1);
1225         cset->dom_cset = cset;
1226         INIT_LIST_HEAD(&cset->tasks);
1227         INIT_LIST_HEAD(&cset->mg_tasks);
1228         INIT_LIST_HEAD(&cset->dying_tasks);
1229         INIT_LIST_HEAD(&cset->task_iters);
1230         INIT_LIST_HEAD(&cset->threaded_csets);
1231         INIT_HLIST_NODE(&cset->hlist);
1232         INIT_LIST_HEAD(&cset->cgrp_links);
1233         INIT_LIST_HEAD(&cset->mg_src_preload_node);
1234         INIT_LIST_HEAD(&cset->mg_dst_preload_node);
1235         INIT_LIST_HEAD(&cset->mg_node);
1236
1237         /* Copy the css pointers which find_existing_css_set() stored
1238          * in @template above */
1239         memcpy(cset->subsys, template, sizeof(cset->subsys));
1240
1241         spin_lock_irq(&css_set_lock);
1242         /* Link @cset to @old_cset's cgroups, with @cgrp swapped in. */
1243         list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
1244                 struct cgroup *c = link->cgrp;
1245
1246                 if (c->root == cgrp->root)
1247                         c = cgrp;
1248                 link_css_set(&tmp_links, cset, c);
1249         }
1250
1251         BUG_ON(!list_empty(&tmp_links));
1252
1253         css_set_count++;
1254
1255         /* Add @cset to the hash table */
1256         key = css_set_hash(cset->subsys);
1257         hash_add(css_set_table, &cset->hlist, key);
1258
1259         for_each_subsys(ss, ssid) {
1260                 struct cgroup_subsys_state *css = cset->subsys[ssid];
1261
1262                 list_add_tail(&cset->e_cset_node[ssid],
1263                               &css->cgroup->e_csets[ssid]);
1264                 css_get(css);
1265         }
1266
1267         spin_unlock_irq(&css_set_lock);
1268
1269         /*
1270          * If @cset should be threaded, look up the matching dom_cset and
1271          * link them up.  @cset is fully initialized first and the dom_cset
1272          * looked up afterwards through a recursive find_css_set() call.
1273          * It's simpler this way and safe as @cset stays empty until return.
1274          */
1275         if (cgroup_is_threaded(cset->dfl_cgrp)) {
1276                 struct css_set *dcset;
1277
1278                 dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp);
1279                 if (!dcset) {
1280                         put_css_set(cset);
1281                         return NULL;
1282                 }
1283
1284                 spin_lock_irq(&css_set_lock);
1285                 cset->dom_cset = dcset;
1286                 list_add_tail(&cset->threaded_csets_node,
1287                               &dcset->threaded_csets);
1288                 spin_unlock_irq(&css_set_lock);
1289         }
1290
1291         return cset;
1292 }
1293
1294 struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
1295 {
1296         struct cgroup *cgrp = kernfs_root_to_node(kf_root)->priv;
1297
1298         return cgrp->root;      /* the hierarchy @cgrp belongs to */
1299 }
1300
1301 void cgroup_favor_dynmods(struct cgroup_root *root, bool favor)
1302 {
1303         bool enabled = root->flags & CGRP_ROOT_FAVOR_DYNMODS;
1304
1305         /* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */
1306         if (!enabled && favor) {
1307                 rcu_sync_enter(&cgroup_threadgroup_rwsem.rss);
1308                 root->flags |= CGRP_ROOT_FAVOR_DYNMODS;
1309         } else if (enabled && !favor) {
1310                 rcu_sync_exit(&cgroup_threadgroup_rwsem.rss);
1311                 root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
1312         }
1313 }
1314
1315 static int cgroup_init_root_id(struct cgroup_root *root)
1316 {
1317         int ret;
1318
1319         lockdep_assert_held(&cgroup_mutex);
1320
1321         ret = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
1322         if (ret >= 0) {
1323                 root->hierarchy_id = ret;
1324                 ret = 0;
1325         }
1326         return ret;
1327 }
1328
1329 static void cgroup_exit_root_id(struct cgroup_root *root)
1330 {
1331         lockdep_assert_held(&cgroup_mutex);
1332         /* give back the ID stored by cgroup_init_root_id() */
1333         idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
1334 }
1335
1336 void cgroup_free_root(struct cgroup_root *root)
1337 {
1338         kfree(root);    /* frees the cgroup_root structure only */
1339 }
1340
1341 static void cgroup_destroy_root(struct cgroup_root *root)
1342 {
1343         struct cgroup *cgrp = &root->cgrp;
1344         struct cgrp_cset_link *link, *tmp_link;
1345
1346         trace_cgroup_destroy_root(root);
1347
1348         cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
1349         /* the hierarchy has to be empty by now: no cgroups, no children */
1350         BUG_ON(atomic_read(&root->nr_cgrps));
1351         BUG_ON(!list_empty(&cgrp->self.children));
1352
1353         /* Hand all of @root's subsystems back to the default hierarchy */
1354         WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));
1355
1356         /*
1357          * Unlink and free every cgrp_cset_link which ties a css_set to
1358          * this hierarchy's root cgroup
1359          */
1360         spin_lock_irq(&css_set_lock);
1361
1362         list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
1363                 list_del(&link->cset_link);
1364                 list_del(&link->cgrp_link);
1365                 kfree(link);
1366         }
1367
1368         spin_unlock_irq(&css_set_lock);
1369
1370         if (!list_empty(&root->root_list)) {
1371                 list_del(&root->root_list);
1372                 cgroup_root_count--;
1373         }
1374
1375         cgroup_favor_dynmods(root, false);
1376         cgroup_exit_root_id(root);
1377
1378         cgroup_unlock();
1379         /* the remaining teardown runs after cgroup_unlock() */
1380         cgroup_rstat_exit(cgrp);
1381         kernfs_destroy_root(root->kf_root);
1382         cgroup_free_root(root);
1383 }
1384
1385 /*
1386  * The returned cgroup carries no reference, it's valid while @cset pins it.
1387  */
1388 static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
1389                                             struct cgroup_root *root)
1390 {
1391         struct cgroup *found = NULL;
1392         struct cgrp_cset_link *lnk;
1393
1394         if (cset == &init_css_set) {
1395                 /* init_css_set maps to the root cgroup of @root */
1396                 found = &root->cgrp;
1397         } else if (root == &cgrp_dfl_root) {
1398                 /* the default hierarchy's cgroup is cached in @cset */
1399                 found = cset->dfl_cgrp;
1400         } else {
1401                 lockdep_assert_held(&css_set_lock);
1402
1403                 list_for_each_entry(lnk, &cset->cgrp_links, cgrp_link) {
1404                         if (lnk->cgrp->root != root)
1405                                 continue;
1406                         found = lnk->cgrp;
1407                         break;
1408                 }
1409         }
1410
1411         BUG_ON(!found);
1412         return found;
1413 }
1414
1415 /*
1416  * Look up the cgroup which the root css_set of the current task's cgroup
1417  * namespace is associated with on the specified hierarchy.
1418  */
1419 static struct cgroup *
1420 current_cgns_cgroup_from_root(struct cgroup_root *root)
1421 {
1422         struct css_set *ns_cset;
1423         struct cgroup *cgrp;
1424
1425         lockdep_assert_held(&css_set_lock);
1426
1427         rcu_read_lock();
1428
1429         /* go through the root css_set of current's cgroup namespace */
1430         ns_cset = current->nsproxy->cgroup_ns->root_cset;
1431         cgrp = __cset_cgroup_from_root(ns_cset, root);
1432         rcu_read_unlock();
1433
1434         return cgrp;
1435 }
1436
1437 /*
1438  * Default hierarchy counterpart of current_cgns_cgroup_from_root(): look up
1439  * the cgroup associated with the current task's cgroup namespace.
1440  *
1441  * Unlike current_cgns_cgroup_from_root(), no locking is needed here:
1442  * - rcu_read_lock isn't taken internally as no rcu pointer gets
1443  *   dereferenced.
1444  * - css_set_lock can be skipped as only cset->dfl_cgrp is read.
1445  * - As a bonus, the returned cgrp stays pinned by current, which cannot
1446  *   switch cgroup_ns asynchronously.
1447  */
1448 static struct cgroup *current_cgns_cgroup_dfl(void)
1449 {
1450         struct css_set *ns_cset;
1451
1452         /*
1453          * NOTE: bpf_cgroup_from_id() may call this on a task which is
1454          * already past exit_task_namespaces() and has a NULL nsproxy.
1455          * Fall back to cgrp_dfl_root in that case, which makes all cgroups
1456          * visible for lookups.
1457          */
1458         if (!current->nsproxy)
1459                 return &cgrp_dfl_root.cgrp;
1460
1461         ns_cset = current->nsproxy->cgroup_ns->root_cset;
1462
1463         return __cset_cgroup_from_root(ns_cset, &cgrp_dfl_root);
1464 }
1465
1466 /* Look up @cset's cgroup on @root; needs cgroup_mutex and css_set_lock. */
1467 static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
1468                                             struct cgroup_root *root)
1469 {
1470         lockdep_assert_held(&cgroup_mutex);
1471         lockdep_assert_held(&css_set_lock);
1472
1473         return __cset_cgroup_from_root(cset, root);
1474 }
1475
1476 /*
1477  * Return the cgroup @task belongs to on the hierarchy @root.  Has to be
1478  * called with both cgroup_mutex and css_set_lock held.
1479  */
1480 struct cgroup *task_cgroup_from_root(struct task_struct *task,
1481                                      struct cgroup_root *root)
1482 {
1483         /* No need to lock @task: css_set_lock is held, which keeps it
1484          * from changing groups. */
1485         struct css_set *cset = task_css_set(task);
1486
1487         return cset_cgroup_from_root(cset, root);
1488 }
1489
1490 /*
1491  * A task must hold cgroup_mutex to modify cgroups.
1492  *
1493  * Any task can increment and decrement the count field without lock.
1494  * So in general, code holding cgroup_mutex can't rely on the count
1495  * field not changing.  However, if the count goes to zero, then only
1496  * cgroup_attach_task() can increment it again.  Because a count of zero
1497  * means that no tasks are currently attached, therefore there is no
1498  * way a task attached to that cgroup can fork (the other way to
1499  * increment the count).  So code holding cgroup_mutex can safely
1500  * assume that if the count is zero, it will stay zero. Similarly, if
1501  * a task holds cgroup_mutex on a cgroup with zero count, it
1502  * knows that the cgroup won't be removed, as cgroup_rmdir()
1503  * needs that mutex.
1504  *
1505  * A cgroup can only be deleted if both its 'count' of using tasks
1506  * is zero, and its list of 'children' cgroups is empty.  Since all
1507  * tasks in the system use _some_ cgroup, and since there is always at
1508  * least one task in the system (init, pid == 1), therefore, root cgroup
1509  * always has either children cgroups and/or using tasks.  So we don't
1510  * need a special hack to ensure that root cgroup cannot be deleted.
1511  *
1512  * P.S.  One more locking exception.  RCU is used to guard the
1513  * update of a task's cgroup pointer by cgroup_attach_task().
1514  */
1515
1516 static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
1517
1518 static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
1519                               char *buf)
1520 {
1521         struct cgroup_subsys *ss = cft->ss;     /* NULL: no subsystem prefix */
1522         bool prefixed = ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
1523                         !(cgrp->root->flags & CGRP_ROOT_NOPREFIX);
1524
1525         if (!prefixed) {
1526                 strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
1527         } else {
1528                 snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s",
1529                          (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : "",
1530                          cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
1531                          cft->name);
1532         }
1533         return buf;
1534 }
1535
1536 /**
1537  * cgroup_file_mode - work out the file mode of a control file
1538  * @cft: the control file in question
1539  *
1540  * S_IRUGO if readable; S_IWUSR, or S_IWUGO if world writable, if writable.
1541  */
1542 static umode_t cgroup_file_mode(const struct cftype *cft)
1543 {
1544         bool readable = cft->read_u64 || cft->read_s64 || cft->seq_show;
1545         bool writable = cft->write_u64 || cft->write_s64 || cft->write;
1546         umode_t mode = 0;
1547
1548         if (readable)
1549                 mode |= S_IRUGO;
1550
1551         if (writable && (cft->flags & CFTYPE_WORLD_WRITABLE))
1552                 mode |= S_IWUGO;
1553         else if (writable)
1554                 mode |= S_IWUSR;
1555
1556         return mode;
1557 }
1558
1559 /**
1560  * cgroup_calc_subtree_ss_mask - work out the effective subtree_ss_mask
1561  * @subtree_control: the subtree_control mask being considered
1562  * @this_ss_mask: mask of the subsystems which are available
1563  *
1564  * On the default hierarchy, a subsystem can ask through ->depends_on for
1565  * other subsystems to be enabled along with it, so more subsystems than
1566  * "cgroup.subtree_control" lists may end up enabled.
1567  *
1568  * Returns the subsystems that need to be enabled for @subtree_control to
1569  * be applied while restricted to @this_ss_mask.  Needs cgroup_mutex held.
1570  */
1571 static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
1572 {
1573         struct cgroup_subsys *ss;
1574         u16 mask, prev;
1575         int ssid;
1576
1577         lockdep_assert_held(&cgroup_mutex);
1578
1579         mask = subtree_control | cgrp_dfl_implicit_ss_mask;
1580
1581         /*
1582          * Keep pulling in the dependencies of everything enabled so far
1583          * until a pass no longer changes the mask.
1584          */
1585         do {
1586                 prev = mask;
1587
1588                 do_each_subsys_mask(ss, ssid, prev) {
1589                         mask |= ss->depends_on;
1590                 } while_each_subsys_mask();
1591
1592                 /*
1593                  * Drop subsystems which aren't available.  That only
1594                  * happens if some depended-upon subsystems were bound to
1595                  * non-default hierarchies.
1596                  */
1597                 mask &= this_ss_mask;
1598         } while (mask != prev);
1599
1600         return mask;
1601 }
1602
1603 /**
1604  * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
1605  * @kn: the kernfs_node being serviced
1606  *
1607  * Undoes a successful cgroup_kn_lock_live() and is to be called before the
1608  * method finishes.  As soon as this returns, the cgroup which was handed
1609  * out by cgroup_kn_lock_live() may become inaccessible at any time.  A
1610  * caller that wants to keep using the cgroup has to pin it before calling
1611  * this function.
1612  */
1613 void cgroup_kn_unlock(struct kernfs_node *kn)
1614 {
1615         struct kernfs_node *dir_kn;
1616         struct cgroup *cgrp;
1617
1618         /* a non-directory node uses the cgroup stored in its parent */
1619         dir_kn = kernfs_type(kn) == KERNFS_DIR ? kn : kn->parent;
1620         cgrp = dir_kn->priv;
1621
1622         cgroup_unlock();
1623
1624         kernfs_unbreak_active_protection(kn);
1625         cgroup_put(cgrp);
1626 }
1627
1628 /**
1629  * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
1630  * @kn: the kernfs_node being serviced
1631  * @drain_offline: perform offline draining on the cgroup
1632  *
1633  * To be used by a cgroup kernfs method which is currently servicing @kn.
1634  * Active protection is broken, cgroup locking is performed and the
1635  * associated cgroup is verified to be alive.  Returns the cgroup if it is
1636  * alive and %NULL otherwise.  A successful return has to be undone by a
1637  * matching cgroup_kn_unlock() call.  With @drain_offline set, the cgroup
1638  * is also drained of offlining csses before return.
1639  *
1640  * Every cgroup kernfs method implementation that needs to lock the
1641  * associated cgroup should go through this helper.  It keeps cgroup
1642  * locking from nesting under kernfs active protection and permits all
1643  * kernfs operations including self-removal.
1644  */
1645 struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
1646 {
1647         struct kernfs_node *dir_kn;
1648         struct cgroup *cgrp;
1649
1650         dir_kn = kernfs_type(kn) == KERNFS_DIR ? kn : kn->parent;
1651         cgrp = dir_kn->priv;
1652
1653         /*
1654          * cgroup_mutex, which is grabbed below, nests outside kernfs
1655          * active_ref, and the cgroup liveliness check alone is enough
1656          * protection against removal.  So make sure that @cgrp stays
1657          * accessible and break the active_ref protection.
1658          */
1659         if (!cgroup_tryget(cgrp))
1660                 return NULL;
1661         kernfs_break_active_protection(kn);
1662
1663         if (drain_offline)
1664                 cgroup_lock_and_drain_offline(cgrp);
1665         else
1666                 cgroup_lock();
1667
1668         if (cgroup_is_dead(cgrp)) {
1669                 cgroup_kn_unlock(kn);
1670                 cgrp = NULL;
1671         }
1672
1673         return cgrp;
1674 }
1675
1676 static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
1677 {
1678         char name[CGROUP_FILE_NAME_MAX];
1679
1680         lockdep_assert_held(&cgroup_mutex);
1681
1682         if (cft->file_offset) {
1683                 struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);
1684                 struct cgroup_file *cfile = (void *)css + cft->file_offset;
1685
1686                 spin_lock_irq(&cgroup_file_kn_lock);
1687                 cfile->kn = NULL;
1688                 spin_unlock_irq(&cgroup_file_kn_lock);
1689
1690                 del_timer_sync(&cfile->notify_timer);
1691         }
1692
1693         kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
1694 }
1695
1696 /**
1697  * css_clear_dir - remove subsys files in a cgroup directory
1698  * @css: target css
1699  */
1700 static void css_clear_dir(struct cgroup_subsys_state *css)
1701 {
1702         struct cgroup *cgrp = css->cgroup;
1703         struct cftype *cfts;
1704
1705         if (!(css->flags & CSS_VISIBLE))
1706                 return;
1707
1708         css->flags &= ~CSS_VISIBLE;
1709
1710         if (!css->ss) {
1711                 if (cgroup_on_dfl(cgrp)) {
1712                         cgroup_addrm_files(css, cgrp,
1713                                            cgroup_base_files, false);
1714                         if (cgroup_psi_enabled())
1715                                 cgroup_addrm_files(css, cgrp,
1716                                                    cgroup_psi_files, false);
1717                 } else {
1718                         cgroup_addrm_files(css, cgrp,
1719                                            cgroup1_base_files, false);
1720                 }
1721         } else {
1722                 list_for_each_entry(cfts, &css->ss->cfts, node)
1723                         cgroup_addrm_files(css, cgrp, cfts, false);
1724         }
1725 }
1726
1727 /**
1728  * css_populate_dir - create subsys files in a cgroup directory
1729  * @css: target css
1730  *
1731  * On failure, no file is added.
1732  */
1733 static int css_populate_dir(struct cgroup_subsys_state *css)
1734 {
1735         struct cgroup *cgrp = css->cgroup;
1736         struct cftype *cfts, *failed_cfts;
1737         int ret;
1738
1739         if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
1740                 return 0;
1741
1742         if (!css->ss) {
1743                 if (cgroup_on_dfl(cgrp)) {
1744                         ret = cgroup_addrm_files(&cgrp->self, cgrp,
1745                                                  cgroup_base_files, true);
1746                         if (ret < 0)
1747                                 return ret;
1748
1749                         if (cgroup_psi_enabled()) {
1750                                 ret = cgroup_addrm_files(&cgrp->self, cgrp,
1751                                                          cgroup_psi_files, true);
1752                                 if (ret < 0)
1753                                         return ret;
1754                         }
1755                 } else {
1756                         cgroup_addrm_files(css, cgrp,
1757                                            cgroup1_base_files, true);
1758                 }
1759         } else {
1760                 list_for_each_entry(cfts, &css->ss->cfts, node) {
1761                         ret = cgroup_addrm_files(css, cgrp, cfts, true);
1762                         if (ret < 0) {
1763                                 failed_cfts = cfts;
1764                                 goto err;
1765                         }
1766                 }
1767         }
1768
1769         css->flags |= CSS_VISIBLE;
1770
1771         return 0;
1772 err:
1773         list_for_each_entry(cfts, &css->ss->cfts, node) {
1774                 if (cfts == failed_cfts)
1775                         break;
1776                 cgroup_addrm_files(css, cgrp, cfts, false);
1777         }
1778         return ret;
1779 }
1780
/*
 * rebind_subsystems - move the subsystems in @ss_mask over to @dst_root
 * @dst_root: hierarchy root the subsystems are bound to
 * @ss_mask: bitmask of the subsystem ids to move
 *
 * Must be called with cgroup_mutex held.
 *
 * Returns -EBUSY, before anything is modified, if one of the subsystems
 * has child csses under its current root (unless it is implicit_on_dfl),
 * or if neither its current root nor @dst_root is the default root.
 * Returns 0 otherwise; a cgroup_apply_control() failure on the destination
 * is only logged and does not change the return value.
 */
int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
{
        struct cgroup *dcgrp = &dst_root->cgrp;
        struct cgroup_subsys *ss;
        int ssid, ret;
        u16 dfl_disable_ss_mask = 0;

        lockdep_assert_held(&cgroup_mutex);

        /* pass 1: validate every requested subsystem; nothing is changed yet */
        do_each_subsys_mask(ss, ssid, ss_mask) {
                /*
                 * If @ss has non-root csses attached to it, can't move.
                 * If @ss is an implicit controller, it is exempt from this
                 * rule and can be stolen.
                 */
                if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
                    !ss->implicit_on_dfl)
                        return -EBUSY;

                /* can't move between two non-dummy roots either */
                if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
                        return -EBUSY;

                /*
                 * Collect ssid's that need to be disabled from default
                 * hierarchy.
                 */
                if (ss->root == &cgrp_dfl_root)
                        dfl_disable_ss_mask |= 1 << ssid;

        } while_each_subsys_mask();

        if (dfl_disable_ss_mask) {
                struct cgroup *scgrp = &cgrp_dfl_root.cgrp;

                /*
                 * Controllers from default hierarchy that need to be rebound
                 * are all disabled together in one go.
                 */
                cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask;
                WARN_ON(cgroup_apply_control(scgrp));
                cgroup_finalize_control(scgrp, 0);
        }

        /* pass 2: actually move each subsystem from its source root */
        do_each_subsys_mask(ss, ssid, ss_mask) {
                struct cgroup_root *src_root = ss->root;
                struct cgroup *scgrp = &src_root->cgrp;
                struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
                struct css_set *cset, *cset_pos;
                struct css_task_iter *it;

                WARN_ON(!css || cgroup_css(dcgrp, ss));

                /* default-root sources were already disabled in one go above */
                if (src_root != &cgrp_dfl_root) {
                        /* disable from the source */
                        src_root->subsys_mask &= ~(1 << ssid);
                        WARN_ON(cgroup_apply_control(scgrp));
                        cgroup_finalize_control(scgrp, 0);
                }

                /* rebind */
                RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
                rcu_assign_pointer(dcgrp->subsys[ssid], css);
                ss->root = dst_root;
                css->cgroup = dcgrp;

                spin_lock_irq(&css_set_lock);
                WARN_ON(!list_empty(&dcgrp->e_csets[ss->id]));
                list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id],
                                         e_cset_node[ss->id]) {
                        list_move_tail(&cset->e_cset_node[ss->id],
                                       &dcgrp->e_csets[ss->id]);
                        /*
                         * All css_sets of scgrp are moved to dcgrp together
                         * and in the same order, so in-flight iterators
                         * must be patched to keep iterating correctly.
                         * An iterator is always advanced right away and is
                         * finished when it->cset_pos meets it->cset_head,
                         * so updating it->cset_head alone is enough here.
                         */
                        list_for_each_entry(it, &cset->task_iters, iters_node)
                                if (it->cset_head == &scgrp->e_csets[ss->id])
                                        it->cset_head = &dcgrp->e_csets[ss->id];
                }
                spin_unlock_irq(&css_set_lock);

                /*
                 * Move the css between rstat lists, waiting for an RCU
                 * grace period between the removal and the re-insertion.
                 */
                if (ss->css_rstat_flush) {
                        list_del_rcu(&css->rstat_css_node);
                        synchronize_rcu();
                        list_add_rcu(&css->rstat_css_node,
                                     &dcgrp->rstat_css_list);
                }

                /* default hierarchy doesn't enable controllers by default */
                dst_root->subsys_mask |= 1 << ssid;
                if (dst_root == &cgrp_dfl_root) {
                        static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
                } else {
                        dcgrp->subtree_control |= 1 << ssid;
                        static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
                }

                /* a failure here is reported but does not abort the rebind */
                ret = cgroup_apply_control(dcgrp);
                if (ret)
                        pr_warn("partial failure to rebind %s controller (err=%d)\n",
                                ss->name, ret);

                /* optional per-subsystem hook, called once the css has moved */
                if (ss->bind)
                        ss->bind(css);
        } while_each_subsys_mask();

        kernfs_activate(dcgrp->kn);
        return 0;
}
1894
1895 int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
1896                      struct kernfs_root *kf_root)
1897 {
1898         int len = 0;
1899         char *buf = NULL;
1900         struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
1901         struct cgroup *ns_cgroup;
1902
1903         buf = kmalloc(PATH_MAX, GFP_KERNEL);
1904         if (!buf)
1905                 return -ENOMEM;
1906
1907         spin_lock_irq(&css_set_lock);
1908         ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
1909         len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
1910         spin_unlock_irq(&css_set_lock);
1911
1912         if (len >= PATH_MAX)
1913                 len = -ERANGE;
1914         else if (len > 0) {
1915                 seq_escape(sf, buf, " \t\n\\");
1916                 len = 0;
1917         }
1918         kfree(buf);
1919         return len;
1920 }
1921
/*
 * Identifiers of the cgroup2 mount options.  Each one is tied to its option
 * string in cgroup2_fs_parameters[] and handled in cgroup2_parse_param().
 */
enum cgroup2_param {
        Opt_nsdelegate,                 /* "nsdelegate" */
        Opt_favordynmods,               /* "favordynmods" */
        Opt_memory_localevents,         /* "memory_localevents" */
        Opt_memory_recursiveprot,       /* "memory_recursiveprot" */
        nr__cgroup2_params              /* last entry: number of options above */
};
1929
/*
 * Mount option table for cgroup2: every option is a plain flag (no value).
 * The table is terminated by an empty entry.
 */
static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
        fsparam_flag("nsdelegate",              Opt_nsdelegate),
        fsparam_flag("favordynmods",            Opt_favordynmods),
        fsparam_flag("memory_localevents",      Opt_memory_localevents),
        fsparam_flag("memory_recursiveprot",    Opt_memory_recursiveprot),
        {}
};
1937
1938 static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param)
1939 {
1940         struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
1941         struct fs_parse_result result;
1942         int opt;
1943
1944         opt = fs_parse(fc, cgroup2_fs_parameters, param, &result);
1945         if (opt < 0)
1946                 return opt;
1947
1948         switch (opt) {
1949         case Opt_nsdelegate:
1950                 ctx->flags |= CGRP_ROOT_NS_DELEGATE;
1951                 return 0;
1952         case Opt_favordynmods:
1953                 ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
1954                 return 0;
1955         case Opt_memory_localevents:
1956                 ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1957                 return 0;
1958         case Opt_memory_recursiveprot:
1959                 ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
1960                 return 0;
1961         }
1962         return -EINVAL;
1963 }
1964
1965 static void apply_cgroup_root_flags(unsigned int root_flags)
1966 {
1967         if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
1968                 if (root_flags & CGRP_ROOT_NS_DELEGATE)
1969                         cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
1970                 else
1971                         cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
1972
1973                 cgroup_favor_dynmods(&cgrp_dfl_root,
1974                                      root_flags & CGRP_ROOT_FAVOR_DYNMODS);
1975
1976                 if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
1977                         cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1978                 else
1979                         cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1980
1981                 if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
1982                         cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
1983                 else
1984                         cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
1985         }
1986 }
1987
1988 static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
1989 {
1990         if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
1991                 seq_puts(seq, ",nsdelegate");
1992         if (cgrp_dfl_root.flags & CGRP_ROOT_FAVOR_DYNMODS)
1993                 seq_puts(seq, ",favordynmods");
1994         if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
1995                 seq_puts(seq, ",memory_localevents");
1996         if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
1997                 seq_puts(seq, ",memory_recursiveprot");
1998         return 0;
1999 }
2000
2001 static int cgroup_reconfigure(struct fs_context *fc)
2002 {
2003         struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
2004
2005         apply_cgroup_root_flags(ctx->flags);
2006         return 0;
2007 }
2008
2009 static void init_cgroup_housekeeping(struct cgroup *cgrp)
2010 {
2011         struct cgroup_subsys *ss;
2012         int ssid;
2013
2014         INIT_LIST_HEAD(&cgrp->self.sibling);
2015         INIT_LIST_HEAD(&cgrp->self.children);
2016         INIT_LIST_HEAD(&cgrp->cset_links);
2017         INIT_LIST_HEAD(&cgrp->pidlists);
2018         mutex_init(&cgrp->pidlist_mutex);
2019         cgrp->self.cgroup = cgrp;
2020         cgrp->self.flags |= CSS_ONLINE;
2021         cgrp->dom_cgrp = cgrp;
2022         cgrp->max_descendants = INT_MAX;
2023         cgrp->max_depth = INT_MAX;
2024         INIT_LIST_HEAD(&cgrp->rstat_css_list);
2025         prev_cputime_init(&cgrp->prev_cputime);
2026
2027         for_each_subsys(ss, ssid)
2028                 INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
2029
2030         init_waitqueue_head(&cgrp->offline_waitq);
2031         INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
2032 }
2033
2034 void init_cgroup_root(struct cgroup_fs_context *ctx)
2035 {
2036         struct cgroup_root *root = ctx->root;
2037         struct cgroup *cgrp = &root->cgrp;
2038
2039         INIT_LIST_HEAD(&root->root_list);
2040         atomic_set(&root->nr_cgrps, 1);
2041         cgrp->root = root;
2042         init_cgroup_housekeeping(cgrp);
2043
2044         /* DYNMODS must be modified through cgroup_favor_dynmods() */
2045         root->flags = ctx->flags & ~CGRP_ROOT_FAVOR_DYNMODS;
2046         if (ctx->release_agent)
2047                 strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
2048         if (ctx->name)
2049                 strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN);
2050         if (ctx->cpuset_clone_children)
2051                 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
2052 }
2053
/*
 * cgroup_setup_root - set up a hierarchy root and bind subsystems to it
 * @root: the root to set up
 * @ss_mask: bitmask of the subsystems to bind to @root
 *
 * Must be called with cgroup_mutex held.  Initialises the root cgroup's
 * reference count, allocates the css_set links, assigns a root id, creates
 * the kernfs root, populates the base files, initialises rstat, rebinds
 * the subsystems and finally links the root cgroup into every css_set.
 *
 * Returns 0 on success or a negative errno.  On failure the steps taken so
 * far are unwound through the labels at the end of the function.
 */
int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
{
        LIST_HEAD(tmp_links);
        struct cgroup *root_cgrp = &root->cgrp;
        struct kernfs_syscall_ops *kf_sops;
        struct css_set *cset;
        int i, ret;

        lockdep_assert_held(&cgroup_mutex);

        ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
                              0, GFP_KERNEL);
        if (ret)
                goto out;

        /*
         * We're accessing css_set_count without locking css_set_lock here,
         * but that's OK - it can only be increased by someone holding
         * cgroup_lock, and that's us.  Later rebinding may disable
         * controllers on the default hierarchy and thus create new csets,
         * which can't be more than the existing ones.  Allocate 2x.
         */
        ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
        if (ret)
                goto cancel_ref;

        ret = cgroup_init_root_id(root);
        if (ret)
                goto cancel_ref;

        /* the default root and the other roots use different syscall ops */
        kf_sops = root == &cgrp_dfl_root ?
                &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;

        root->kf_root = kernfs_create_root(kf_sops,
                                           KERNFS_ROOT_CREATE_DEACTIVATED |
                                           KERNFS_ROOT_SUPPORT_EXPORTOP |
                                           KERNFS_ROOT_SUPPORT_USER_XATTR,
                                           root_cgrp);
        if (IS_ERR(root->kf_root)) {
                ret = PTR_ERR(root->kf_root);
                goto exit_root_id;
        }
        root_cgrp->kn = kernfs_root_to_node(root->kf_root);
        WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
        root_cgrp->ancestors[0] = root_cgrp;

        ret = css_populate_dir(&root_cgrp->self);
        if (ret)
                goto destroy_root;

        ret = cgroup_rstat_init(root_cgrp);
        if (ret)
                goto destroy_root;

        ret = rebind_subsystems(root, ss_mask);
        if (ret)
                goto exit_stats;

        /* a failure here only triggers a warning; setup continues */
        ret = cgroup_bpf_inherit(root_cgrp);
        WARN_ON_ONCE(ret);

        trace_cgroup_setup_root(root);

        /*
         * There must be no failure case after here, since rebinding takes
         * care of subsystems' refcounts, which are explicitly dropped in
         * the failure exit path.
         */
        list_add(&root->root_list, &cgroup_roots);
        cgroup_root_count++;

        /*
         * Link the root cgroup in this hierarchy into all the css_set
         * objects.
         */
        spin_lock_irq(&css_set_lock);
        hash_for_each(css_set_table, i, cset, hlist) {
                link_css_set(&tmp_links, cset, root_cgrp);
                if (css_set_populated(cset))
                        cgroup_update_populated(root_cgrp, true);
        }
        spin_unlock_irq(&css_set_lock);

        BUG_ON(!list_empty(&root_cgrp->self.children));
        BUG_ON(atomic_read(&root->nr_cgrps) != 1);

        /* success: skip the unwind labels, "out" still frees unused links */
        ret = 0;
        goto out;

        /* error unwinding, in reverse order of the setup steps above */
exit_stats:
        cgroup_rstat_exit(root_cgrp);
destroy_root:
        kernfs_destroy_root(root->kf_root);
        root->kf_root = NULL;
exit_root_id:
        cgroup_exit_root_id(root);
cancel_ref:
        percpu_ref_exit(&root_cgrp->self.refcnt);
out:
        free_cgrp_cset_links(&tmp_links);
        return ret;
}
2156
/*
 * cgroup_do_get_tree - obtain the superblock and root dentry for a mount
 * @fc: the filesystem context of the mount
 *
 * Hands @ctx->root's kernfs root and the magic number matching the
 * filesystem type to kernfs_get_tree().  When the context belongs to a
 * non-init cgroup namespace, fc->root is then replaced with the dentry of
 * the namespace's root cgroup.
 *
 * Returns 0 on success or a negative errno.  In either case the reference
 * on the root cgroup is dropped here unless a new superblock was created.
 */
int cgroup_do_get_tree(struct fs_context *fc)
{
        struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
        int ret;

        ctx->kfc.root = ctx->root->kf_root;
        if (fc->fs_type == &cgroup2_fs_type)
                ctx->kfc.magic = CGROUP2_SUPER_MAGIC;
        else
                ctx->kfc.magic = CGROUP_SUPER_MAGIC;
        ret = kernfs_get_tree(fc);

        /*
         * In non-init cgroup namespace, instead of root cgroup's dentry,
         * we return the dentry corresponding to the cgroupns->root_cgrp.
         */
        if (!ret && ctx->ns != &init_cgroup_ns) {
                struct dentry *nsdentry;
                struct super_block *sb = fc->root->d_sb;
                struct cgroup *cgrp;

                cgroup_lock();
                spin_lock_irq(&css_set_lock);

                cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);

                spin_unlock_irq(&css_set_lock);
                cgroup_unlock();

                nsdentry = kernfs_node_dentry(cgrp->kn, sb);
                /* the dentry obtained from kernfs_get_tree() is being replaced */
                dput(fc->root);
                if (IS_ERR(nsdentry)) {
                        /* no dentry to hand back: release the superblock too */
                        deactivate_locked_super(sb);
                        ret = PTR_ERR(nsdentry);
                        nsdentry = NULL;
                }
                fc->root = nsdentry;
        }

        /* an existing superblock was reused: drop the extra root reference */
        if (!ctx->kfc.new_sb_created)
                cgroup_put(&ctx->root->cgrp);

        return ret;
}
2201
2202 /*
2203  * Destroy a cgroup filesystem context.
2204  */
2205 static void cgroup_fs_context_free(struct fs_context *fc)
2206 {
2207         struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
2208
2209         kfree(ctx->name);
2210         kfree(ctx->release_agent);
2211         put_cgroup_ns(ctx->ns);
2212         kernfs_free_fs_context(fc);
2213         kfree(ctx);
2214 }
2215
2216 static int cgroup_get_tree(struct fs_context *fc)
2217 {
2218         struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
2219         int ret;
2220
2221         WRITE_ONCE(cgrp_dfl_visible, true);
2222         cgroup_get_live(&cgrp_dfl_root.cgrp);
2223         ctx->root = &cgrp_dfl_root;
2224
2225         ret = cgroup_do_get_tree(fc);
2226         if (!ret)
2227                 apply_cgroup_root_flags(ctx->flags);
2228         return ret;
2229 }
2230
/*
 * fs_context operations for "cgroup2"; cgroup_init_fs_context() installs
 * them when the context's fs_type is cgroup2_fs_type.
 */
static const struct fs_context_operations cgroup_fs_context_ops = {
        .free           = cgroup_fs_context_free,
        .parse_param    = cgroup2_parse_param,
        .get_tree       = cgroup_get_tree,
        .reconfigure    = cgroup_reconfigure,
};
2237
/*
 * fs_context operations installed by cgroup_init_fs_context() for every
 * fs_type other than cgroup2_fs_type.  They share the free handler with
 * cgroup2 but use the cgroup1 parse/get_tree/reconfigure handlers.
 */
static const struct fs_context_operations cgroup1_fs_context_ops = {
        .free           = cgroup_fs_context_free,
        .parse_param    = cgroup1_parse_param,
        .get_tree       = cgroup1_get_tree,
        .reconfigure    = cgroup1_reconfigure,
};
2244
2245 /*
2246  * Initialise the cgroup filesystem creation/reconfiguration context.  Notably,
2247  * we select the namespace we're going to use.
2248  */
2249 static int cgroup_init_fs_context(struct fs_context *fc)
2250 {
2251         struct cgroup_fs_context *ctx;
2252
2253         ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL);
2254         if (!ctx)
2255                 return -ENOMEM;
2256
2257         ctx->ns = current->nsproxy->cgroup_ns;
2258         get_cgroup_ns(ctx->ns);
2259         fc->fs_private = &ctx->kfc;
2260         if (fc->fs_type == &cgroup2_fs_type)
2261                 fc->ops = &cgroup_fs_context_ops;
2262         else
2263                 fc->ops = &cgroup1_fs_context_ops;
2264         put_user_ns(fc->user_ns);
2265         fc->user_ns = get_user_ns(ctx->ns->user_ns);
2266         fc->global = true;
2267
2268 #ifdef CONFIG_CGROUP_FAVOR_DYNMODS
2269         ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
2270 #endif
2271         return 0;
2272 }
2273
2274 static void cgroup_kill_sb(struct super_block *sb)
2275 {
2276         struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
2277         struct cgroup_root *root = cgroup_root_from_kf(kf_root);
2278
2279         /*
2280          * If @root doesn't have any children, start killing it.
2281          * This prevents new mounts by disabling percpu_ref_tryget_live().
2282          *
2283          * And don't kill the default root.
2284          */
2285         if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
2286             !percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
2287                 cgroup_bpf_offline(&root->cgrp);
2288                 percpu_ref_kill(&root->cgrp.self.refcnt);
2289         }
2290         cgroup_put(&root->cgrp);
2291         kernfs_kill_sb(sb);
2292 }
2293
/*
 * The "cgroup" filesystem type.  It takes the cgroup1 parameter table and
 * shares the context initialiser and kill_sb handler with "cgroup2".
 * Not static: it is also the type that cpuset mounts are switched to.
 */
struct file_system_type cgroup_fs_type = {
        .name                   = "cgroup",
        .init_fs_context        = cgroup_init_fs_context,
        .parameters             = cgroup1_fs_parameters,
        .kill_sb                = cgroup_kill_sb,
        .fs_flags               = FS_USERNS_MOUNT,
};
2301
/*
 * The "cgroup2" filesystem type.  Differs from "cgroup" only in its name
 * and in using the cgroup2 parameter table.
 */
static struct file_system_type cgroup2_fs_type = {
        .name                   = "cgroup2",
        .init_fs_context        = cgroup_init_fs_context,
        .parameters             = cgroup2_fs_parameters,
        .kill_sb                = cgroup_kill_sb,
        .fs_flags               = FS_USERNS_MOUNT,
};
2309
2310 #ifdef CONFIG_CPUSETS
/*
 * fs_context operations for "cpuset" mounts: the cgroup1 get_tree and the
 * common free handler.  No parse_param or reconfigure handler is set.
 */
static const struct fs_context_operations cpuset_fs_context_ops = {
        .get_tree       = cgroup1_get_tree,
        .free           = cgroup_fs_context_free,
};
2315
2316 /*
2317  * This is ugly, but preserves the userspace API for existing cpuset
2318  * users. If someone tries to mount the "cpuset" filesystem, we
2319  * silently switch it to mount "cgroup" instead
2320  */
2321 static int cpuset_init_fs_context(struct fs_context *fc)
2322 {
2323         char *agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER);
2324         struct cgroup_fs_context *ctx;
2325         int err;
2326
2327         err = cgroup_init_fs_context(fc);
2328         if (err) {
2329                 kfree(agent);
2330                 return err;
2331         }
2332
2333         fc->ops = &cpuset_fs_context_ops;
2334
2335         ctx = cgroup_fc2context(fc);
2336         ctx->subsys_mask = 1 << cpuset_cgrp_id;
2337         ctx->flags |= CGRP_ROOT_NOPREFIX;
2338         ctx->release_agent = agent;
2339
2340         get_filesystem(&cgroup_fs_type);
2341         put_filesystem(fc->fs_type);
2342         fc->fs_type = &cgroup_fs_type;
2343
2344         return 0;
2345 }
2346
/*
 * The legacy "cpuset" filesystem type.  Its context initialiser redirects
 * the mount to cgroup_fs_type, so no parameters or kill_sb are set here.
 */
static struct file_system_type cpuset_fs_type = {
        .name                   = "cpuset",
        .init_fs_context        = cpuset_init_fs_context,
        .fs_flags               = FS_USERNS_MOUNT,
};
2352 #endif
2353
2354 int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
2355                           struct cgroup_namespace *ns)
2356 {
2357         struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
2358
2359         return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
2360 }
2361
2362 int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
2363                    struct cgroup_namespace *ns)
2364 {
2365         int ret;
2366
2367         cgroup_lock();
2368         spin_lock_irq(&css_set_lock);
2369
2370         ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
2371
2372         spin_unlock_irq(&css_set_lock);
2373         cgroup_unlock();
2374
2375         return ret;
2376 }
2377 EXPORT_SYMBOL_GPL(cgroup_path_ns);
2378
2379 /**
2380  * cgroup_attach_lock - Lock for ->attach()
2381  * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
2382  *
2383  * cgroup migration sometimes needs to stabilize threadgroups against forks and
2384  * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
2385  * implementations (e.g. cpuset), also need to disable CPU hotplug.
2386  * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can
2387  * lead to deadlocks.
2388  *
2389  * Bringing up a CPU may involve creating and destroying tasks which requires
2390  * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside
2391  * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while
2392  * write-locking threadgroup_rwsem, the locking order is reversed and we end up
2393  * waiting for an on-going CPU hotplug operation which in turn is waiting for
2394  * the threadgroup_rwsem to be released to create new tasks. For more details:
2395  *
2396  *   http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
2397  *
2398  * Resolve the situation by always acquiring cpus_read_lock() before optionally
2399  * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
2400  * CPU hotplug is disabled on entry.
2401  */
2402 void cgroup_attach_lock(bool lock_threadgroup)
2403 {
2404         cpus_read_lock();
2405         if (lock_threadgroup)
2406                 percpu_down_write(&cgroup_threadgroup_rwsem);
2407 }
2408
2409 /**
2410  * cgroup_attach_unlock - Undo cgroup_attach_lock()
2411  * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
2412  */
2413 void cgroup_attach_unlock(bool lock_threadgroup)
2414 {
2415         if (lock_threadgroup)
2416                 percpu_up_write(&cgroup_threadgroup_rwsem);
2417         cpus_read_unlock();
2418 }
2419
2420 /**
2421  * cgroup_migrate_add_task - add a migration target task to a migration context
2422  * @task: target task
2423  * @mgctx: target migration context
2424  *
2425  * Add @task, which is a migration target, to @mgctx->tset.  This function
2426  * becomes noop if @task doesn't need to be migrated.  @task's css_set
2427  * should have been added as a migration source and @task->cg_list will be
2428  * moved from the css_set's tasks list to mg_tasks one.
2429  */
2430 static void cgroup_migrate_add_task(struct task_struct *task,
2431                                     struct cgroup_mgctx *mgctx)
2432 {
2433         struct css_set *cset;
2434
2435         lockdep_assert_held(&css_set_lock);
2436
2437         /* @task either already exited or can't exit until the end */
2438         if (task->flags & PF_EXITING)
2439                 return;
2440
2441         /* cgroup_threadgroup_rwsem protects racing against forks */
2442         WARN_ON_ONCE(list_empty(&task->cg_list));
2443
2444         cset = task_css_set(task);
2445         if (!cset->mg_src_cgrp)
2446                 return;
2447
2448         mgctx->tset.nr_tasks++;
2449
2450         list_move_tail(&task->cg_list, &cset->mg_tasks);
2451         if (list_empty(&cset->mg_node))
2452                 list_add_tail(&cset->mg_node,
2453                               &mgctx->tset.src_csets);
2454         if (list_empty(&cset->mg_dst_cset->mg_node))
2455                 list_add_tail(&cset->mg_dst_cset->mg_node,
2456                               &mgctx->tset.dst_csets);
2457 }
2458
2459 /**
2460  * cgroup_taskset_first - reset taskset and return the first task
2461  * @tset: taskset of interest
2462  * @dst_cssp: output variable for the destination css
2463  *
2464  * @tset iteration is initialized and the first task is returned.
2465  */
2466 struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
2467                                          struct cgroup_subsys_state **dst_cssp)
2468 {
2469         tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
2470         tset->cur_task = NULL;
2471
2472         return cgroup_taskset_next(tset, dst_cssp);
2473 }
2474
2475 /**
2476  * cgroup_taskset_next - iterate to the next task in taskset
2477  * @tset: taskset of interest
2478  * @dst_cssp: output variable for the destination css
2479  *
2480  * Return the next task in @tset.  Iteration must have been initialized
2481  * with cgroup_taskset_first().
2482  */
2483 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
2484                                         struct cgroup_subsys_state **dst_cssp)
2485 {
2486         struct css_set *cset = tset->cur_cset;
2487         struct task_struct *task = tset->cur_task;
2488
2489         while (CGROUP_HAS_SUBSYS_CONFIG && &cset->mg_node != tset->csets) {
2490                 if (!task)
2491                         task = list_first_entry(&cset->mg_tasks,
2492                                                 struct task_struct, cg_list);
2493                 else
2494                         task = list_next_entry(task, cg_list);
2495
2496                 if (&task->cg_list != &cset->mg_tasks) {
2497                         tset->cur_cset = cset;
2498                         tset->cur_task = task;
2499
2500                         /*
2501                          * This function may be called both before and
2502                          * after cgroup_taskset_migrate().  The two cases
2503                          * can be distinguished by looking at whether @cset
2504                          * has its ->mg_dst_cset set.
2505                          */
2506                         if (cset->mg_dst_cset)
2507                                 *dst_cssp = cset->mg_dst_cset->subsys[tset->ssid];
2508                         else
2509                                 *dst_cssp = cset->subsys[tset->ssid];
2510
2511                         return task;
2512                 }
2513
2514                 cset = list_next_entry(cset, mg_node);
2515                 task = NULL;
2516         }
2517
2518         return NULL;
2519 }
2520
2521 /**
2522  * cgroup_migrate_execute - migrate a taskset
2523  * @mgctx: migration context
2524  *
2525  * Migrate tasks in @mgctx as setup by migration preparation functions.
2526  * This function fails iff one of the ->can_attach callbacks fails and
2527  * guarantees that either all or none of the tasks in @mgctx are migrated.
2528  * @mgctx is consumed regardless of success.
2529  */
2530 static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
2531 {
2532         struct cgroup_taskset *tset = &mgctx->tset;
2533         struct cgroup_subsys *ss;
2534         struct task_struct *task, *tmp_task;
2535         struct css_set *cset, *tmp_cset;
2536         int ssid, failed_ssid, ret;
2537
2538         /* check that we can legitimately attach to the cgroup */
2539         if (tset->nr_tasks) {
2540                 do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
2541                         if (ss->can_attach) {
2542                                 tset->ssid = ssid;
2543                                 ret = ss->can_attach(tset);
2544                                 if (ret) {
2545                                         failed_ssid = ssid;
2546                                         goto out_cancel_attach;
2547                                 }
2548                         }
2549                 } while_each_subsys_mask();
2550         }
2551
2552         /*
2553          * Now that we're guaranteed success, proceed to move all tasks to
2554          * the new cgroup.  There are no failure cases after here, so this
2555          * is the commit point.
2556          */
2557         spin_lock_irq(&css_set_lock);
2558         list_for_each_entry(cset, &tset->src_csets, mg_node) {
2559                 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
2560                         struct css_set *from_cset = task_css_set(task);
2561                         struct css_set *to_cset = cset->mg_dst_cset;
2562
2563                         get_css_set(to_cset);
2564                         to_cset->nr_tasks++;
2565                         css_set_move_task(task, from_cset, to_cset, true);
2566                         from_cset->nr_tasks--;
2567                         /*
2568                          * If the source or destination cgroup is frozen,
2569                          * the task might require to change its state.
2570                          */
2571                         cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp,
2572                                                     to_cset->dfl_cgrp);
2573                         put_css_set_locked(from_cset);
2574
2575                 }
2576         }
2577         spin_unlock_irq(&css_set_lock);
2578
2579         /*
2580          * Migration is committed, all target tasks are now on dst_csets.
2581          * Nothing is sensitive to fork() after this point.  Notify
2582          * controllers that migration is complete.
2583          */
2584         tset->csets = &tset->dst_csets;
2585
2586         if (tset->nr_tasks) {
2587                 do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
2588                         if (ss->attach) {
2589                                 tset->ssid = ssid;
2590                                 ss->attach(tset);
2591                         }
2592                 } while_each_subsys_mask();
2593         }
2594
2595         ret = 0;
2596         goto out_release_tset;
2597
2598 out_cancel_attach:
2599         if (tset->nr_tasks) {
2600                 do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
2601                         if (ssid == failed_ssid)
2602                                 break;
2603                         if (ss->cancel_attach) {
2604                                 tset->ssid = ssid;
2605                                 ss->cancel_attach(tset);
2606                         }
2607                 } while_each_subsys_mask();
2608         }
2609 out_release_tset:
2610         spin_lock_irq(&css_set_lock);
2611         list_splice_init(&tset->dst_csets, &tset->src_csets);
2612         list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
2613                 list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
2614                 list_del_init(&cset->mg_node);
2615         }
2616         spin_unlock_irq(&css_set_lock);
2617
2618         /*
2619          * Re-initialize the cgroup_taskset structure in case it is reused
2620          * again in another cgroup_migrate_add_task()/cgroup_migrate_execute()
2621          * iteration.
2622          */
2623         tset->nr_tasks = 0;
2624         tset->csets    = &tset->src_csets;
2625         return ret;
2626 }
2627
2628 /**
2629  * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination
2630  * @dst_cgrp: destination cgroup to test
2631  *
2632  * On the default hierarchy, except for the mixable, (possible) thread root
2633  * and threaded cgroups, subtree_control must be zero for migration
2634  * destination cgroups with tasks so that child cgroups don't compete
2635  * against tasks.
2636  */
2637 int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
2638 {
2639         /* v1 doesn't have any restriction */
2640         if (!cgroup_on_dfl(dst_cgrp))
2641                 return 0;
2642
2643         /* verify @dst_cgrp can host resources */
2644         if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
2645                 return -EOPNOTSUPP;
2646
2647         /*
2648          * If @dst_cgrp is already or can become a thread root or is
2649          * threaded, it doesn't matter.
2650          */
2651         if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp))
2652                 return 0;
2653
2654         /* apply no-internal-process constraint */
2655         if (dst_cgrp->subtree_control)
2656                 return -EBUSY;
2657
2658         return 0;
2659 }
2660
2661 /**
2662  * cgroup_migrate_finish - cleanup after attach
2663  * @mgctx: migration context
2664  *
2665  * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst().  See
2666  * those functions for details.
2667  */
2668 void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
2669 {
2670         struct css_set *cset, *tmp_cset;
2671
2672         lockdep_assert_held(&cgroup_mutex);
2673
2674         spin_lock_irq(&css_set_lock);
2675
2676         list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets,
2677                                  mg_src_preload_node) {
2678                 cset->mg_src_cgrp = NULL;
2679                 cset->mg_dst_cgrp = NULL;
2680                 cset->mg_dst_cset = NULL;
2681                 list_del_init(&cset->mg_src_preload_node);
2682                 put_css_set_locked(cset);
2683         }
2684
2685         list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets,
2686                                  mg_dst_preload_node) {
2687                 cset->mg_src_cgrp = NULL;
2688                 cset->mg_dst_cgrp = NULL;
2689                 cset->mg_dst_cset = NULL;
2690                 list_del_init(&cset->mg_dst_preload_node);
2691                 put_css_set_locked(cset);
2692         }
2693
2694         spin_unlock_irq(&css_set_lock);
2695 }
2696
2697 /**
2698  * cgroup_migrate_add_src - add a migration source css_set
2699  * @src_cset: the source css_set to add
2700  * @dst_cgrp: the destination cgroup
2701  * @mgctx: migration context
2702  *
2703  * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp.  Pin
2704  * @src_cset and add it to @mgctx->src_csets, which should later be cleaned
2705  * up by cgroup_migrate_finish().
2706  *
2707  * This function may be called without holding cgroup_threadgroup_rwsem
2708  * even if the target is a process.  Threads may be created and destroyed
2709  * but as long as cgroup_mutex is not dropped, no new css_set can be put
2710  * into play and the preloaded css_sets are guaranteed to cover all
2711  * migrations.
2712  */
2713 void cgroup_migrate_add_src(struct css_set *src_cset,
2714                             struct cgroup *dst_cgrp,
2715                             struct cgroup_mgctx *mgctx)
2716 {
2717         struct cgroup *src_cgrp;
2718
2719         lockdep_assert_held(&cgroup_mutex);
2720         lockdep_assert_held(&css_set_lock);
2721
2722         /*
2723          * If ->dead, @src_set is associated with one or more dead cgroups
2724          * and doesn't contain any migratable tasks.  Ignore it early so
2725          * that the rest of migration path doesn't get confused by it.
2726          */
2727         if (src_cset->dead)
2728                 return;
2729
2730         if (!list_empty(&src_cset->mg_src_preload_node))
2731                 return;
2732
2733         src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
2734
2735         WARN_ON(src_cset->mg_src_cgrp);
2736         WARN_ON(src_cset->mg_dst_cgrp);
2737         WARN_ON(!list_empty(&src_cset->mg_tasks));
2738         WARN_ON(!list_empty(&src_cset->mg_node));
2739
2740         src_cset->mg_src_cgrp = src_cgrp;
2741         src_cset->mg_dst_cgrp = dst_cgrp;
2742         get_css_set(src_cset);
2743         list_add_tail(&src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets);
2744 }
2745
2746 /**
2747  * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
2748  * @mgctx: migration context
2749  *
2750  * Tasks are about to be moved and all the source css_sets have been
2751  * preloaded to @mgctx->preloaded_src_csets.  This function looks up and
2752  * pins all destination css_sets, links each to its source, and append them
2753  * to @mgctx->preloaded_dst_csets.
2754  *
2755  * This function must be called after cgroup_migrate_add_src() has been
2756  * called on each migration source css_set.  After migration is performed
2757  * using cgroup_migrate(), cgroup_migrate_finish() must be called on
2758  * @mgctx.
2759  */
2760 int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
2761 {
2762         struct css_set *src_cset, *tmp_cset;
2763
2764         lockdep_assert_held(&cgroup_mutex);
2765
2766         /* look up the dst cset for each src cset and link it to src */
2767         list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
2768                                  mg_src_preload_node) {
2769                 struct css_set *dst_cset;
2770                 struct cgroup_subsys *ss;
2771                 int ssid;
2772
2773                 dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
2774                 if (!dst_cset)
2775                         return -ENOMEM;
2776
2777                 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
2778
2779                 /*
2780                  * If src cset equals dst, it's noop.  Drop the src.
2781                  * cgroup_migrate() will skip the cset too.  Note that we
2782                  * can't handle src == dst as some nodes are used by both.
2783                  */
2784                 if (src_cset == dst_cset) {
2785                         src_cset->mg_src_cgrp = NULL;
2786                         src_cset->mg_dst_cgrp = NULL;
2787                         list_del_init(&src_cset->mg_src_preload_node);
2788                         put_css_set(src_cset);
2789                         put_css_set(dst_cset);
2790                         continue;
2791                 }
2792
2793                 src_cset->mg_dst_cset = dst_cset;
2794
2795                 if (list_empty(&dst_cset->mg_dst_preload_node))
2796                         list_add_tail(&dst_cset->mg_dst_preload_node,
2797                                       &mgctx->preloaded_dst_csets);
2798                 else
2799                         put_css_set(dst_cset);
2800
2801                 for_each_subsys(ss, ssid)
2802                         if (src_cset->subsys[ssid] != dst_cset->subsys[ssid])
2803                                 mgctx->ss_mask |= 1 << ssid;
2804         }
2805
2806         return 0;
2807 }
2808
2809 /**
2810  * cgroup_migrate - migrate a process or task to a cgroup
2811  * @leader: the leader of the process or the task to migrate
2812  * @threadgroup: whether @leader points to the whole process or a single task
2813  * @mgctx: migration context
2814  *
2815  * Migrate a process or task denoted by @leader.  If migrating a process,
2816  * the caller must be holding cgroup_threadgroup_rwsem.  The caller is also
2817  * responsible for invoking cgroup_migrate_add_src() and
2818  * cgroup_migrate_prepare_dst() on the targets before invoking this
2819  * function and following up with cgroup_migrate_finish().
2820  *
2821  * As long as a controller's ->can_attach() doesn't fail, this function is
2822  * guaranteed to succeed.  This means that, excluding ->can_attach()
2823  * failure, when migrating multiple targets, the success or failure can be
2824  * decided for all targets by invoking group_migrate_prepare_dst() before
2825  * actually starting migrating.
2826  */
2827 int cgroup_migrate(struct task_struct *leader, bool threadgroup,
2828                    struct cgroup_mgctx *mgctx)
2829 {
2830         struct task_struct *task;
2831
2832         /*
2833          * The following thread iteration should be inside an RCU critical
2834          * section to prevent tasks from being freed while taking the snapshot.
2835          * spin_lock_irq() implies RCU critical section here.
2836          */
2837         spin_lock_irq(&css_set_lock);
2838         task = leader;
2839         do {
2840                 cgroup_migrate_add_task(task, mgctx);
2841                 if (!threadgroup)
2842                         break;
2843         } while_each_thread(leader, task);
2844         spin_unlock_irq(&css_set_lock);
2845
2846         return cgroup_migrate_execute(mgctx);
2847 }
2848
2849 /**
2850  * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
2851  * @dst_cgrp: the cgroup to attach to
2852  * @leader: the task or the leader of the threadgroup to be attached
2853  * @threadgroup: attach the whole threadgroup?
2854  *
2855  * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
2856  */
2857 int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
2858                        bool threadgroup)
2859 {
2860         DEFINE_CGROUP_MGCTX(mgctx);
2861         struct task_struct *task;
2862         int ret = 0;
2863
2864         /* look up all src csets */
2865         spin_lock_irq(&css_set_lock);
2866         rcu_read_lock();
2867         task = leader;
2868         do {
2869                 cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
2870                 if (!threadgroup)
2871                         break;
2872         } while_each_thread(leader, task);
2873         rcu_read_unlock();
2874         spin_unlock_irq(&css_set_lock);
2875
2876         /* prepare dst csets and commit */
2877         ret = cgroup_migrate_prepare_dst(&mgctx);
2878         if (!ret)
2879                 ret = cgroup_migrate(leader, threadgroup, &mgctx);
2880
2881         cgroup_migrate_finish(&mgctx);
2882
2883         if (!ret)
2884                 TRACE_CGROUP_PATH(attach_task, dst_cgrp, leader, threadgroup);
2885
2886         return ret;
2887 }
2888
2889 struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
2890                                              bool *threadgroup_locked)
2891 {
2892         struct task_struct *tsk;
2893         pid_t pid;
2894
2895         if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
2896                 return ERR_PTR(-EINVAL);
2897
2898         /*
2899          * If we migrate a single thread, we don't care about threadgroup
2900          * stability. If the thread is `current`, it won't exit(2) under our
2901          * hands or change PID through exec(2). We exclude
2902          * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write
2903          * callers by cgroup_mutex.
2904          * Therefore, we can skip the global lock.
2905          */
2906         lockdep_assert_held(&cgroup_mutex);
2907         *threadgroup_locked = pid || threadgroup;
2908         cgroup_attach_lock(*threadgroup_locked);
2909
2910         rcu_read_lock();
2911         if (pid) {
2912                 tsk = find_task_by_vpid(pid);
2913                 if (!tsk) {
2914                         tsk = ERR_PTR(-ESRCH);
2915                         goto out_unlock_threadgroup;
2916                 }
2917         } else {
2918                 tsk = current;
2919         }
2920
2921         if (threadgroup)
2922                 tsk = tsk->group_leader;
2923
2924         /*
2925          * kthreads may acquire PF_NO_SETAFFINITY during initialization.
2926          * If userland migrates such a kthread to a non-root cgroup, it can
2927          * become trapped in a cpuset, or RT kthread may be born in a
2928          * cgroup with no rt_runtime allocated.  Just say no.
2929          */
2930         if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
2931                 tsk = ERR_PTR(-EINVAL);
2932                 goto out_unlock_threadgroup;
2933         }
2934
2935         get_task_struct(tsk);
2936         goto out_unlock_rcu;
2937
2938 out_unlock_threadgroup:
2939         cgroup_attach_unlock(*threadgroup_locked);
2940         *threadgroup_locked = false;
2941 out_unlock_rcu:
2942         rcu_read_unlock();
2943         return tsk;
2944 }
2945
2946 void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked)
2947 {
2948         struct cgroup_subsys *ss;
2949         int ssid;
2950
2951         /* release reference from cgroup_procs_write_start() */
2952         put_task_struct(task);
2953
2954         cgroup_attach_unlock(threadgroup_locked);
2955
2956         for_each_subsys(ss, ssid)
2957                 if (ss->post_attach)
2958                         ss->post_attach();
2959 }
2960
2961 static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
2962 {
2963         struct cgroup_subsys *ss;
2964         bool printed = false;
2965         int ssid;
2966
2967         do_each_subsys_mask(ss, ssid, ss_mask) {
2968                 if (printed)
2969                         seq_putc(seq, ' ');
2970                 seq_puts(seq, ss->name);
2971                 printed = true;
2972         } while_each_subsys_mask();
2973         if (printed)
2974                 seq_putc(seq, '\n');
2975 }
2976
2977 /* show controllers which are enabled from the parent */
2978 static int cgroup_controllers_show(struct seq_file *seq, void *v)
2979 {
2980         struct cgroup *cgrp = seq_css(seq)->cgroup;
2981
2982         cgroup_print_ss_mask(seq, cgroup_control(cgrp));
2983         return 0;
2984 }
2985
2986 /* show controllers which are enabled for a given cgroup's children */
2987 static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
2988 {
2989         struct cgroup *cgrp = seq_css(seq)->cgroup;
2990
2991         cgroup_print_ss_mask(seq, cgrp->subtree_control);
2992         return 0;
2993 }
2994
2995 /**
2996  * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
2997  * @cgrp: root of the subtree to update csses for
2998  *
2999  * @cgrp's control masks have changed and its subtree's css associations
3000  * need to be updated accordingly.  This function looks up all css_sets
3001  * which are attached to the subtree, creates the matching updated css_sets
3002  * and migrates the tasks to the new ones.
3003  */
3004 static int cgroup_update_dfl_csses(struct cgroup *cgrp)
3005 {
3006         DEFINE_CGROUP_MGCTX(mgctx);
3007         struct cgroup_subsys_state *d_css;
3008         struct cgroup *dsct;
3009         struct css_set *src_cset;
3010         bool has_tasks;
3011         int ret;
3012
3013         lockdep_assert_held(&cgroup_mutex);
3014
3015         /* look up all csses currently attached to @cgrp's subtree */
3016         spin_lock_irq(&css_set_lock);
3017         cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
3018                 struct cgrp_cset_link *link;
3019
3020                 /*
3021                  * As cgroup_update_dfl_csses() is only called by
3022                  * cgroup_apply_control(). The csses associated with the
3023                  * given cgrp will not be affected by changes made to
3024                  * its subtree_control file. We can skip them.
3025                  */
3026                 if (dsct == cgrp)
3027                         continue;
3028
3029                 list_for_each_entry(link, &dsct->cset_links, cset_link)
3030                         cgroup_migrate_add_src(link->cset, dsct, &mgctx);
3031         }
3032         spin_unlock_irq(&css_set_lock);
3033
3034         /*
3035          * We need to write-lock threadgroup_rwsem while migrating tasks.
3036          * However, if there are no source csets for @cgrp, changing its
3037          * controllers isn't gonna produce any task migrations and the
3038          * write-locking can be skipped safely.
3039          */
3040         has_tasks = !list_empty(&mgctx.preloaded_src_csets);
3041         cgroup_attach_lock(has_tasks);
3042
3043         /* NULL dst indicates self on default hierarchy */
3044         ret = cgroup_migrate_prepare_dst(&mgctx);
3045         if (ret)
3046                 goto out_finish;
3047
3048         spin_lock_irq(&css_set_lock);
3049         list_for_each_entry(src_cset, &mgctx.preloaded_src_csets,
3050                             mg_src_preload_node) {
3051                 struct task_struct *task, *ntask;
3052
3053                 /* all tasks in src_csets need to be migrated */
3054                 list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
3055                         cgroup_migrate_add_task(task, &mgctx);
3056         }
3057         spin_unlock_irq(&css_set_lock);
3058
3059         ret = cgroup_migrate_execute(&mgctx);
3060 out_finish:
3061         cgroup_migrate_finish(&mgctx);
3062         cgroup_attach_unlock(has_tasks);
3063         return ret;
3064 }
3065
3066 /**
3067  * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses
3068  * @cgrp: root of the target subtree
3069  *
3070  * Because css offlining is asynchronous, userland may try to re-enable a
3071  * controller while the previous css is still around.  This function grabs
3072  * cgroup_mutex and drains the previous css instances of @cgrp's subtree.
3073  */
3074 void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
3075         __acquires(&cgroup_mutex)
3076 {
3077         struct cgroup *dsct;
3078         struct cgroup_subsys_state *d_css;
3079         struct cgroup_subsys *ss;
3080         int ssid;
3081
3082 restart:
3083         cgroup_lock();
3084
3085         cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
3086                 for_each_subsys(ss, ssid) {
3087                         struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
3088                         DEFINE_WAIT(wait);
3089
3090                         if (!css || !percpu_ref_is_dying(&css->refcnt))
3091                                 continue;
3092
3093                         cgroup_get_live(dsct);
3094                         prepare_to_wait(&dsct->offline_waitq, &wait,
3095                                         TASK_UNINTERRUPTIBLE);
3096
3097                         cgroup_unlock();
3098                         schedule();
3099                         finish_wait(&dsct->offline_waitq, &wait);
3100
3101                         cgroup_put(dsct);
3102                         goto restart;
3103                 }
3104         }
3105 }
3106
3107 /**
3108  * cgroup_save_control - save control masks and dom_cgrp of a subtree
3109  * @cgrp: root of the target subtree
3110  *
3111  * Save ->subtree_control, ->subtree_ss_mask and ->dom_cgrp to the
3112  * respective old_ prefixed fields for @cgrp's subtree including @cgrp
3113  * itself.
3114  */
3115 static void cgroup_save_control(struct cgroup *cgrp)
3116 {
3117         struct cgroup *dsct;
3118         struct cgroup_subsys_state *d_css;
3119
3120         cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
3121                 dsct->old_subtree_control = dsct->subtree_control;
3122                 dsct->old_subtree_ss_mask = dsct->subtree_ss_mask;
3123                 dsct->old_dom_cgrp = dsct->dom_cgrp;
3124         }
3125 }
3126
3127 /**
3128  * cgroup_propagate_control - refresh control masks of a subtree
3129  * @cgrp: root of the target subtree
3130  *
3131  * For @cgrp and its subtree, ensure ->subtree_ss_mask matches
3132  * ->subtree_control and propagate controller availability through the
3133  * subtree so that descendants don't have unavailable controllers enabled.
3134  */
3135 static void cgroup_propagate_control(struct cgroup *cgrp)
3136 {
3137         struct cgroup *dsct;
3138         struct cgroup_subsys_state *d_css;
3139
3140         cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
3141                 dsct->subtree_control &= cgroup_control(dsct);
3142                 dsct->subtree_ss_mask =
3143                         cgroup_calc_subtree_ss_mask(dsct->subtree_control,
3144                                                     cgroup_ss_mask(dsct));
3145         }
3146 }
3147
3148 /**
3149  * cgroup_restore_control - restore control masks and dom_cgrp of a subtree
3150  * @cgrp: root of the target subtree
3151  *
3152  * Restore ->subtree_control, ->subtree_ss_mask and ->dom_cgrp from the
3153  * respective old_ prefixed fields for @cgrp's subtree including @cgrp
3154  * itself.
3155  */
3156 static void cgroup_restore_control(struct cgroup *cgrp)
3157 {
3158         struct cgroup *dsct;
3159         struct cgroup_subsys_state *d_css;
3160
3161         cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
3162                 dsct->subtree_control = dsct->old_subtree_control;
3163                 dsct->subtree_ss_mask = dsct->old_subtree_ss_mask;
3164                 dsct->dom_cgrp = dsct->old_dom_cgrp;
3165         }
3166 }
3167
3168 static bool css_visible(struct cgroup_subsys_state *css)
3169 {
3170         struct cgroup_subsys *ss = css->ss;
3171         struct cgroup *cgrp = css->cgroup;
3172
3173         if (cgroup_control(cgrp) & (1 << ss->id))
3174                 return true;
3175         if (!(cgroup_ss_mask(cgrp) & (1 << ss->id)))
3176                 return false;
3177         return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl;
3178 }
3179
3180 /**
3181  * cgroup_apply_control_enable - enable or show csses according to control
3182  * @cgrp: root of the target subtree
3183  *
3184  * Walk @cgrp's subtree and create new csses or make the existing ones
3185  * visible.  A css is created invisible if it's being implicitly enabled
3186  * through dependency.  An invisible css is made visible when the userland
3187  * explicitly enables it.
3188  *
3189  * Returns 0 on success, -errno on failure.  On failure, csses which have
3190  * been processed already aren't cleaned up.  The caller is responsible for
3191  * cleaning up with cgroup_apply_control_disable().
3192  */
3193 static int cgroup_apply_control_enable(struct cgroup *cgrp)
3194 {
3195         struct cgroup *dsct;
3196         struct cgroup_subsys_state *d_css;
3197         struct cgroup_subsys *ss;
3198         int ssid, ret;
3199
3200         cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
3201                 for_each_subsys(ss, ssid) {
3202                         struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
3203
3204                         if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))
3205                                 continue;
3206
3207                         if (!css) {
3208                                 css = css_create(dsct, ss);
3209                                 if (IS_ERR(css))
3210                                         return PTR_ERR(css);
3211                         }
3212
3213                         WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
3214
3215                         if (css_visible(css)) {
3216                                 ret = css_populate_dir(css);
3217                                 if (ret)
3218                                         return ret;
3219                         }
3220                 }
3221         }
3222
3223         return 0;
3224 }
3225
3226 /**
3227  * cgroup_apply_control_disable - kill or hide csses according to control
3228  * @cgrp: root of the target subtree
3229  *
3230  * Walk @cgrp's subtree and kill and hide csses so that they match
3231  * cgroup_ss_mask() and cgroup_visible_mask().
3232  *
3233  * A css is hidden when the userland requests it to be disabled while other
3234  * subsystems are still depending on it.  The css must not actively control
3235  * resources and be in the vanilla state if it's made visible again later.
3236  * Controllers which may be depended upon should provide ->css_reset() for
3237  * this purpose.
3238  */
3239 static void cgroup_apply_control_disable(struct cgroup *cgrp)
3240 {
3241         struct cgroup *dsct;
3242         struct cgroup_subsys_state *d_css;
3243         struct cgroup_subsys *ss;
3244         int ssid;
3245
3246         cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
3247                 for_each_subsys(ss, ssid) {
3248                         struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
3249
3250                         if (!css)
3251                                 continue;
3252
3253                         WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
3254
3255                         if (css->parent &&
3256                             !(cgroup_ss_mask(dsct) & (1 << ss->id))) {
3257                                 kill_css(css);
3258                         } else if (!css_visible(css)) {
3259                                 css_clear_dir(css);
3260                                 if (ss->css_reset)
3261                                         ss->css_reset(css);
3262                         }
3263                 }
3264         }
3265 }
3266
/**
 * cgroup_apply_control - apply control mask updates to the subtree
 * @cgrp: root of the target subtree
 *
 * Subsystems are enabled and disabled in a subtree with the following
 * sequence.
 *
 * 1. Call cgroup_save_control() to stash the current state.
 * 2. Update ->subtree_control masks in the subtree as desired.
 * 3. Call cgroup_apply_control() to apply the changes.
 * 4. Optionally perform other related operations.
 * 5. Call cgroup_finalize_control() to finish up.
 *
 * This function is step 3: it propagates the mask changes throughout
 * @cgrp's subtree, updates csses accordingly and performs process
 * migrations.
 *
 * Return: 0 on success, otherwise the error reported by
 * cgroup_apply_control_enable() or cgroup_update_dfl_csses().
 */
static int cgroup_apply_control(struct cgroup *cgrp)
{
	int err;

	cgroup_propagate_control(cgrp);

	err = cgroup_apply_control_enable(cgrp);
	if (!err) {
		/*
		 * At this point, cgroup_e_css_by_mask() results reflect the
		 * new csses, so cgroup_update_dfl_csses() properly updates
		 * the css associations of all tasks in the subtree.
		 */
		err = cgroup_update_dfl_csses(cgrp);
	}

	return err;
}
3301
/**
 * cgroup_finalize_control - finalize control mask update
 * @cgrp: root of the target subtree
 * @ret: the result of the update
 *
 * Last step of a control mask update; see cgroup_apply_control() for the
 * whole sequence.  When @ret reports a failure the saved masks are put
 * back and propagated again.  In either case csses that no longer match
 * the resulting masks are then killed or hidden.
 */
static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
{
	/* the update failed: roll back to the stashed control masks */
	if (ret != 0) {
		cgroup_restore_control(cgrp);
		cgroup_propagate_control(cgrp);
	}

	cgroup_apply_control_disable(cgrp);
}
3318
3319 static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
3320 {
3321         u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
3322
3323         /* if nothing is getting enabled, nothing to worry about */
3324         if (!enable)
3325                 return 0;
3326
3327         /* can @cgrp host any resources? */
3328         if (!cgroup_is_valid_domain(cgrp->dom_cgrp))
3329                 return -EOPNOTSUPP;
3330
3331         /* mixables don't care */
3332         if (cgroup_is_mixable(cgrp))
3333                 return 0;
3334
3335         if (domain_enable) {
3336                 /* can't enable domain controllers inside a thread subtree */
3337                 if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp))
3338                         return -EOPNOTSUPP;
3339         } else {
3340                 /*
3341                  * Threaded controllers can handle internal competitions
3342                  * and are always allowed inside a (prospective) thread
3343                  * subtree.
3344                  */
3345                 if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp))
3346                         return 0;
3347         }
3348
3349         /*
3350          * Controllers can't be enabled for a cgroup with tasks to avoid
3351          * child cgroups competing against tasks.
3352          */
3353         if (cgroup_has_tasks(cgrp))
3354                 return -EBUSY;
3355
3356         return 0;
3357 }
3358
3359 /* change the enabled child controllers for a cgroup in the default hierarchy */
3360 static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
3361                                             char *buf, size_t nbytes,
3362                                             loff_t off)
3363 {
3364         u16 enable = 0, disable = 0;
3365         struct cgroup *cgrp, *child;
3366         struct cgroup_subsys *ss;
3367         char *tok;
3368         int ssid, ret;
3369
3370         /*
3371          * Parse input - space separated list of subsystem names prefixed
3372          * with either + or -.
3373          */
3374         buf = strstrip(buf);
3375         while ((tok = strsep(&buf, " "))) {
3376                 if (tok[0] == '\0')
3377                         continue;
3378                 do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) {
3379                         if (!cgroup_ssid_enabled(ssid) ||
3380                             strcmp(tok + 1, ss->name))
3381                                 continue;
3382
3383                         if (*tok == '+') {
3384                                 enable |= 1 << ssid;
3385                                 disable &= ~(1 << ssid);
3386                         } else if (*tok == '-') {
3387                                 disable |= 1 << ssid;
3388                                 enable &= ~(1 << ssid);
3389                         } else {
3390                                 return -EINVAL;
3391                         }
3392                         break;
3393                 } while_each_subsys_mask();
3394                 if (ssid == CGROUP_SUBSYS_COUNT)
3395                         return -EINVAL;
3396         }
3397
3398         cgrp = cgroup_kn_lock_live(of->kn, true);
3399         if (!cgrp)
3400                 return -ENODEV;
3401
3402         for_each_subsys(ss, ssid) {
3403                 if (enable & (1 << ssid)) {
3404                         if (cgrp->subtree_control & (1 << ssid)) {
3405                                 enable &= ~(1 << ssid);
3406                                 continue;
3407                         }
3408
3409                         if (!(cgroup_control(cgrp) & (1 << ssid))) {
3410                                 ret = -ENOENT;
3411                                 goto out_unlock;
3412                         }
3413                 } else if (disable & (1 << ssid)) {
3414                         if (!(cgrp->subtree_control & (1 << ssid))) {
3415                                 disable &= ~(1 << ssid);
3416                                 continue;
3417                         }
3418
3419                         /* a child has it enabled? */
3420                         cgroup_for_each_live_child(child, cgrp) {
3421                                 if (child->subtree_control & (1 << ssid)) {
3422                                         ret = -EBUSY;
3423                                         goto out_unlock;
3424                                 }
3425                         }
3426                 }
3427         }
3428
3429         if (!enable && !disable) {
3430                 ret = 0;
3431                 goto out_unlock;
3432         }
3433
3434         ret = cgroup_vet_subtree_control_enable(cgrp, enable);
3435         if (ret)
3436                 goto out_unlock;
3437
3438         /* save and update control masks and prepare csses */
3439         cgroup_save_control(cgrp);
3440
3441         cgrp->subtree_control |= enable;
3442         cgrp->subtree_control &= ~disable;
3443
3444         ret = cgroup_apply_control(cgrp);
3445         cgroup_finalize_control(cgrp, ret);
3446         if (ret)
3447                 goto out_unlock;
3448
3449         kernfs_activate(cgrp->kn);
3450 out_unlock:
3451         cgroup_kn_unlock(of->kn);
3452         return ret ?: nbytes;
3453 }
3454
3455 /**
3456  * cgroup_enable_threaded - make @cgrp threaded
3457  * @cgrp: the target cgroup
3458  *
3459  * Called when "threaded" is written to the cgroup.type interface file and
3460  * tries to make @cgrp threaded and join the parent's resource domain.
3461  * This function is never called on the root cgroup as cgroup.type doesn't
3462  * exist on it.
3463  */
3464 static int cgroup_enable_threaded(struct cgroup *cgrp)
3465 {
3466         struct cgroup *parent = cgroup_parent(cgrp);
3467         struct cgroup *dom_cgrp = parent->dom_cgrp;
3468         struct cgroup *dsct;
3469         struct cgroup_subsys_state *d_css;
3470         int ret;
3471
3472         lockdep_assert_held(&cgroup_mutex);
3473
3474         /* noop if already threaded */
3475         if (cgroup_is_threaded(cgrp))
3476                 return 0;
3477
3478         /*
3479          * If @cgroup is populated or has domain controllers enabled, it
3480          * can't be switched.  While the below cgroup_can_be_thread_root()
3481          * test can catch the same conditions, that's only when @parent is
3482          * not mixable, so let's check it explicitly.
3483          */
3484         if (cgroup_is_populated(cgrp) ||
3485             cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
3486                 return -EOPNOTSUPP;
3487
3488         /* we're joining the parent's domain, ensure its validity */
3489         if (!cgroup_is_valid_domain(dom_cgrp) ||
3490             !cgroup_can_be_thread_root(dom_cgrp))
3491                 return -EOPNOTSUPP;
3492
3493         /*
3494          * The following shouldn't cause actual migrations and should
3495          * always succeed.
3496          */
3497         cgroup_save_control(cgrp);
3498
3499         cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)
3500                 if (dsct == cgrp || cgroup_is_threaded(dsct))
3501                         dsct->dom_cgrp = dom_cgrp;
3502
3503         ret = cgroup_apply_control(cgrp);
3504         if (!ret)
3505                 parent->nr_threaded_children++;
3506
3507         cgroup_finalize_control(cgrp, ret);
3508         return ret;
3509 }
3510
3511 static int cgroup_type_show(struct seq_file *seq, void *v)
3512 {
3513         struct cgroup *cgrp = seq_css(seq)->cgroup;
3514
3515         if (cgroup_is_threaded(cgrp))
3516                 seq_puts(seq, "threaded\n");
3517         else if (!cgroup_is_valid_domain(cgrp))
3518                 seq_puts(seq, "domain invalid\n");
3519         else if (cgroup_is_thread_root(cgrp))
3520                 seq_puts(seq, "domain threaded\n");
3521         else
3522                 seq_puts(seq, "domain\n");
3523
3524         return 0;
3525 }
3526
3527 static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
3528                                  size_t nbytes, loff_t off)
3529 {
3530         struct cgroup *cgrp;
3531         int ret;
3532
3533         /* only switching to threaded mode is supported */
3534         if (strcmp(strstrip(buf), "threaded"))
3535                 return -EINVAL;
3536
3537         /* drain dying csses before we re-apply (threaded) subtree control */
3538         cgrp = cgroup_kn_lock_live(of->kn, true);
3539         if (!cgrp)
3540                 return -ENOENT;
3541
3542         /* threaded can only be enabled */
3543         ret = cgroup_enable_threaded(cgrp);
3544
3545         cgroup_kn_unlock(of->kn);
3546         return ret ?: nbytes;
3547 }
3548
3549 static int cgroup_max_descendants_show(struct seq_file *seq, void *v)
3550 {
3551         struct cgroup *cgrp = seq_css(seq)->cgroup;
3552         int descendants = READ_ONCE(cgrp->max_descendants);
3553
3554         if (descendants == INT_MAX)
3555                 seq_puts(seq, "max\n");
3556         else
3557                 seq_printf(seq, "%d\n", descendants);
3558
3559         return 0;
3560 }
3561
3562 static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
3563                                            char *buf, size_t nbytes, loff_t off)
3564 {
3565         struct cgroup *cgrp;
3566         int descendants;
3567         ssize_t ret;
3568
3569         buf = strstrip(buf);
3570         if (!strcmp(buf, "max")) {
3571                 descendants = INT_MAX;
3572         } else {
3573                 ret = kstrtoint(buf, 0, &descendants);
3574                 if (ret)
3575                         return ret;
3576         }
3577
3578         if (descendants < 0)
3579                 return -ERANGE;
3580
3581         cgrp = cgroup_kn_lock_live(of->kn, false);
3582         if (!cgrp)
3583                 return -ENOENT;
3584
3585         cgrp->max_descendants = descendants;
3586
3587         cgroup_kn_unlock(of->kn);
3588
3589         return nbytes;
3590 }
3591
3592 static int cgroup_max_depth_show(struct seq_file *seq, void *v)
3593 {
3594         struct cgroup *cgrp = seq_css(seq)->cgroup;
3595         int depth = READ_ONCE(cgrp->max_depth);
3596
3597         if (depth == INT_MAX)
3598                 seq_puts(seq, "max\n");
3599         else
3600                 seq_printf(seq, "%d\n", depth);
3601
3602         return 0;
3603 }
3604
3605 static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
3606                                       char *buf, size_t nbytes, loff_t off)
3607 {
3608         struct cgroup *cgrp;
3609         ssize_t ret;
3610         int depth;
3611
3612         buf = strstrip(buf);
3613         if (!strcmp(buf, "max")) {
3614                 depth = INT_MAX;
3615         } else {
3616                 ret = kstrtoint(buf, 0, &depth);
3617                 if (ret)
3618                         return ret;
3619         }
3620
3621         if (depth < 0)
3622                 return -ERANGE;
3623
3624         cgrp = cgroup_kn_lock_live(of->kn, false);
3625         if (!cgrp)
3626                 return -ENOENT;
3627
3628         cgrp->max_depth = depth;
3629
3630         cgroup_kn_unlock(of->kn);
3631
3632         return nbytes;
3633 }
3634
3635 static int cgroup_events_show(struct seq_file *seq, void *v)
3636 {
3637         struct cgroup *cgrp = seq_css(seq)->cgroup;
3638
3639         seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp));
3640         seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags));
3641
3642         return 0;
3643 }
3644
3645 static int cgroup_stat_show(struct seq_file *seq, void *v)
3646 {
3647         struct cgroup *cgroup = seq_css(seq)->cgroup;
3648
3649         seq_printf(seq, "nr_descendants %d\n",
3650                    cgroup->nr_descendants);
3651         seq_printf(seq, "nr_dying_descendants %d\n",
3652                    cgroup->nr_dying_descendants);
3653
3654         return 0;
3655 }
3656
3657 static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
3658                                                  struct cgroup *cgrp, int ssid)
3659 {
3660         struct cgroup_subsys *ss = cgroup_subsys[ssid];
3661         struct cgroup_subsys_state *css;
3662         int ret;
3663
3664         if (!ss->css_extra_stat_show)
3665                 return 0;
3666
3667         css = cgroup_tryget_css(cgrp, ss);
3668         if (!css)
3669                 return 0;
3670
3671         ret = ss->css_extra_stat_show(seq, css);
3672         css_put(css);
3673         return ret;
3674 }
3675
3676 static int cpu_stat_show(struct seq_file *seq, void *v)
3677 {
3678         struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
3679         int ret = 0;
3680
3681         cgroup_base_stat_cputime_show(seq);
3682 #ifdef CONFIG_CGROUP_SCHED
3683         ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
3684 #endif
3685         return ret;
3686 }
3687
3688 #ifdef CONFIG_PSI
3689 static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
3690 {
3691         struct cgroup *cgrp = seq_css(seq)->cgroup;
3692         struct psi_group *psi = cgroup_psi(cgrp);
3693
3694         return psi_show(seq, psi, PSI_IO);
3695 }
3696 static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
3697 {
3698         struct cgroup *cgrp = seq_css(seq)->cgroup;
3699         struct psi_group *psi = cgroup_psi(cgrp);
3700
3701         return psi_show(seq, psi, PSI_MEM);
3702 }
3703 static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
3704 {
3705         struct cgroup *cgrp = seq_css(seq)->cgroup;
3706         struct psi_group *psi = cgroup_psi(cgrp);
3707
3708         return psi_show(seq, psi, PSI_CPU);
3709 }
3710
3711 static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
3712                               size_t nbytes, enum psi_res res)
3713 {
3714         struct cgroup_file_ctx *ctx = of->priv;
3715         struct psi_trigger *new;
3716         struct cgroup *cgrp;
3717         struct psi_group *psi;
3718
3719         cgrp = cgroup_kn_lock_live(of->kn, false);
3720         if (!cgrp)
3721                 return -ENODEV;
3722
3723         cgroup_get(cgrp);
3724         cgroup_kn_unlock(of->kn);
3725
3726         /* Allow only one trigger per file descriptor */
3727         if (ctx->psi.trigger) {
3728                 cgroup_put(cgrp);
3729                 return -EBUSY;
3730         }
3731
3732         psi = cgroup_psi(cgrp);
3733         new = psi_trigger_create(psi, buf, res, of->file);
3734         if (IS_ERR(new)) {
3735                 cgroup_put(cgrp);
3736                 return PTR_ERR(new);
3737         }
3738
3739         smp_store_release(&ctx->psi.trigger, new);
3740         cgroup_put(cgrp);
3741
3742         return nbytes;
3743 }
3744
3745 static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
3746                                           char *buf, size_t nbytes,
3747                                           loff_t off)
3748 {
3749         return pressure_write(of, buf, nbytes, PSI_IO);
3750 }
3751
3752 static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
3753                                           char *buf, size_t nbytes,
3754                                           loff_t off)
3755 {
3756         return pressure_write(of, buf, nbytes, PSI_MEM);
3757 }
3758
3759 static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
3760                                           char *buf, size_t nbytes,
3761                                           loff_t off)
3762 {
3763         return pressure_write(of, buf, nbytes, PSI_CPU);
3764 }
3765
3766 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
3767 static int cgroup_irq_pressure_show(struct seq_file *seq, void *v)
3768 {
3769         struct cgroup *cgrp = seq_css(seq)->cgroup;
3770         struct psi_group *psi = cgroup_psi(cgrp);
3771
3772         return psi_show(seq, psi, PSI_IRQ);
3773 }
3774
3775 static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
3776                                          char *buf, size_t nbytes,
3777                                          loff_t off)
3778 {
3779         return pressure_write(of, buf, nbytes, PSI_IRQ);
3780 }
3781 #endif
3782
3783 static int cgroup_pressure_show(struct seq_file *seq, void *v)
3784 {
3785         struct cgroup *cgrp = seq_css(seq)->cgroup;
3786         struct psi_group *psi = cgroup_psi(cgrp);
3787
3788         seq_printf(seq, "%d\n", psi->enabled);
3789
3790         return 0;
3791 }
3792
3793 static ssize_t cgroup_pressure_write(struct kernfs_open_file *of,
3794                                      char *buf, size_t nbytes,
3795                                      loff_t off)
3796 {
3797         ssize_t ret;
3798         int enable;
3799         struct cgroup *cgrp;
3800         struct psi_group *psi;
3801
3802         ret = kstrtoint(strstrip(buf), 0, &enable);
3803         if (ret)
3804                 return ret;
3805
3806         if (enable < 0 || enable > 1)
3807                 return -ERANGE;
3808
3809         cgrp = cgroup_kn_lock_live(of->kn, false);
3810         if (!cgrp)
3811                 return -ENOENT;
3812
3813         psi = cgroup_psi(cgrp);
3814         if (psi->enabled != enable) {
3815                 int i;
3816
3817                 /* show or hide {cpu,memory,io,irq}.pressure files */
3818                 for (i = 0; i < NR_PSI_RESOURCES; i++)
3819                         cgroup_file_show(&cgrp->psi_files[i], enable);
3820
3821                 psi->enabled = enable;
3822                 if (enable)
3823                         psi_cgroup_restart(psi);
3824         }
3825
3826         cgroup_kn_unlock(of->kn);
3827
3828         return nbytes;
3829 }
3830
3831 static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
3832                                           poll_table *pt)
3833 {
3834         struct cgroup_file_ctx *ctx = of->priv;
3835
3836         return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
3837 }
3838
3839 static int cgroup_pressure_open(struct kernfs_open_file *of)
3840 {
3841         if (of->file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
3842                 return -EPERM;
3843
3844         return 0;
3845 }
3846
3847 static void cgroup_pressure_release(struct kernfs_open_file *of)
3848 {
3849         struct cgroup_file_ctx *ctx = of->priv;
3850
3851         psi_trigger_destroy(ctx->psi.trigger);
3852 }
3853
3854 bool cgroup_psi_enabled(void)
3855 {
3856         if (static_branch_likely(&psi_disabled))
3857                 return false;
3858
3859         return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
3860 }
3861
3862 #else /* CONFIG_PSI */
3863 bool cgroup_psi_enabled(void)
3864 {
3865         return false;
3866 }
3867
3868 #endif /* CONFIG_PSI */
3869
3870 static int cgroup_freeze_show(struct seq_file *seq, void *v)
3871 {
3872         struct cgroup *cgrp = seq_css(seq)->cgroup;
3873
3874         seq_printf(seq, "%d\n", cgrp->freezer.freeze);
3875
3876         return 0;
3877 }
3878
3879 static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
3880                                    char *buf, size_t nbytes, loff_t off)
3881 {
3882         struct cgroup *cgrp;
3883         ssize_t ret;
3884         int freeze;
3885
3886         ret = kstrtoint(strstrip(buf), 0, &freeze);
3887         if (ret)
3888                 return ret;
3889
3890         if (freeze < 0 || freeze > 1)
3891                 return -ERANGE;
3892
3893         cgrp = cgroup_kn_lock_live(of->kn, false);
3894         if (!cgrp)
3895                 return -ENOENT;
3896
3897         cgroup_freeze(cgrp, freeze);
3898
3899         cgroup_kn_unlock(of->kn);
3900
3901         return nbytes;
3902 }
3903
3904 static void __cgroup_kill(struct cgroup *cgrp)
3905 {
3906         struct css_task_iter it;
3907         struct task_struct *task;
3908
3909         lockdep_assert_held(&cgroup_mutex);
3910
3911         spin_lock_irq(&css_set_lock);
3912         set_bit(CGRP_KILL, &cgrp->flags);
3913         spin_unlock_irq(&css_set_lock);
3914
3915         css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
3916         while ((task = css_task_iter_next(&it))) {
3917                 /* Ignore kernel threads here. */
3918                 if (task->flags & PF_KTHREAD)
3919                         continue;
3920
3921                 /* Skip tasks that are already dying. */
3922                 if (__fatal_signal_pending(task))
3923                         continue;
3924
3925                 send_sig(SIGKILL, task, 0);
3926         }
3927         css_task_iter_end(&it);
3928
3929         spin_lock_irq(&css_set_lock);
3930         clear_bit(CGRP_KILL, &cgrp->flags);
3931         spin_unlock_irq(&css_set_lock);
3932 }
3933
3934 static void cgroup_kill(struct cgroup *cgrp)
3935 {
3936         struct cgroup_subsys_state *css;
3937         struct cgroup *dsct;
3938
3939         lockdep_assert_held(&cgroup_mutex);
3940
3941         cgroup_for_each_live_descendant_pre(dsct, css, cgrp)
3942                 __cgroup_kill(dsct);
3943 }
3944
3945 static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf,
3946                                  size_t nbytes, loff_t off)
3947 {
3948         ssize_t ret = 0;
3949         int kill;
3950         struct cgroup *cgrp;
3951
3952         ret = kstrtoint(strstrip(buf), 0, &kill);
3953         if (ret)
3954                 return ret;
3955
3956         if (kill != 1)
3957                 return -ERANGE;
3958
3959         cgrp = cgroup_kn_lock_live(of->kn, false);
3960         if (!cgrp)
3961                 return -ENOENT;
3962
3963         /*
3964          * Killing is a process directed operation, i.e. the whole thread-group
3965          * is taken down so act like we do for cgroup.procs and only make this
3966          * writable in non-threaded cgroups.
3967          */
3968         if (cgroup_is_threaded(cgrp))
3969                 ret = -EOPNOTSUPP;
3970         else
3971                 cgroup_kill(cgrp);
3972
3973         cgroup_kn_unlock(of->kn);
3974
3975         return ret ?: nbytes;
3976 }
3977
3978 static int cgroup_file_open(struct kernfs_open_file *of)
3979 {
3980         struct cftype *cft = of_cft(of);
3981         struct cgroup_file_ctx *ctx;
3982         int ret;
3983
3984         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
3985         if (!ctx)
3986                 return -ENOMEM;
3987
3988         ctx->ns = current->nsproxy->cgroup_ns;
3989         get_cgroup_ns(ctx->ns);
3990         of->priv = ctx;
3991
3992         if (!cft->open)
3993                 return 0;
3994
3995         ret = cft->open(of);
3996         if (ret) {
3997                 put_cgroup_ns(ctx->ns);
3998                 kfree(ctx);
3999         }
4000         return ret;
4001 }
4002
4003 static void cgroup_file_release(struct kernfs_open_file *of)
4004 {
4005         struct cftype *cft = of_cft(of);
4006         struct cgroup_file_ctx *ctx = of->priv;
4007
4008         if (cft->release)
4009                 cft->release(of);
4010         put_cgroup_ns(ctx->ns);
4011         kfree(ctx);
4012 }
4013
4014 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
4015                                  size_t nbytes, loff_t off)
4016 {
4017         struct cgroup_file_ctx *ctx = of->priv;
4018         struct cgroup *cgrp = of->kn->parent->priv;
4019         struct cftype *cft = of_cft(of);
4020         struct cgroup_subsys_state *css;
4021         int ret;
4022
4023         if (!nbytes)
4024                 return 0;
4025
4026         /*
4027          * If namespaces are delegation boundaries, disallow writes to
4028          * files in an non-init namespace root from inside the namespace
4029          * except for the files explicitly marked delegatable -
4030          * cgroup.procs and cgroup.subtree_control.
4031          */
4032         if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
4033             !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
4034             ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp)
4035                 return -EPERM;
4036
4037         if (cft->write)
4038                 return cft->write(of, buf, nbytes, off);
4039
4040         /*
4041          * kernfs guarantees that a file isn't deleted with operations in
4042          * flight, which means that the matching css is and stays alive and
4043          * doesn't need to be pinned.  The RCU locking is not necessary
4044          * either.  It's just for the convenience of using cgroup_css().
4045          */
4046         rcu_read_lock();
4047         css = cgroup_css(cgrp, cft->ss);
4048         rcu_read_unlock();
4049
4050         if (cft->write_u64) {
4051                 unsigned long long v;
4052                 ret = kstrtoull(buf, 0, &v);
4053                 if (!ret)
4054                         ret = cft->write_u64(css, cft, v);
4055         } else if (cft->write_s64) {
4056                 long long v;
4057                 ret = kstrtoll(buf, 0, &v);
4058                 if (!ret)
4059                         ret = cft->write_s64(css, cft, v);
4060         } else {
4061                 ret = -EINVAL;
4062         }
4063
4064         return ret ?: nbytes;
4065 }
4066
4067 static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt)
4068 {
4069         struct cftype *cft = of_cft(of);
4070
4071         if (cft->poll)
4072                 return cft->poll(of, pt);
4073
4074         return kernfs_generic_poll(of, pt);
4075 }
4076
4077 static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
4078 {
4079         return seq_cft(seq)->seq_start(seq, ppos);
4080 }
4081
4082 static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
4083 {
4084         return seq_cft(seq)->seq_next(seq, v, ppos);
4085 }
4086
4087 static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
4088 {
4089         if (seq_cft(seq)->seq_stop)
4090                 seq_cft(seq)->seq_stop(seq, v);
4091 }
4092
4093 static int cgroup_seqfile_show(struct seq_file *m, void *arg)
4094 {
4095         struct cftype *cft = seq_cft(m);
4096         struct cgroup_subsys_state *css = seq_css(m);
4097
4098         if (cft->seq_show)
4099                 return cft->seq_show(m, arg);
4100
4101         if (cft->read_u64)
4102                 seq_printf(m, "%llu\n", cft->read_u64(css, cft));
4103         else if (cft->read_s64)
4104                 seq_printf(m, "%lld\n", cft->read_s64(css, cft));
4105         else
4106                 return -EINVAL;
4107         return 0;
4108 }
4109
/*
 * kernfs operations for cgroup files whose content is produced by
 * cgroup_seqfile_show() alone: no seq_start/seq_next/seq_stop iterator
 * callbacks are installed.  Writes are limited to PAGE_SIZE via
 * ->atomic_write_len.
 */
static struct kernfs_ops cgroup_kf_single_ops = {
	.atomic_write_len	= PAGE_SIZE,
	.open			= cgroup_file_open,
	.release		= cgroup_file_release,
	.write			= cgroup_file_write,
	.poll			= cgroup_file_poll,
	.seq_show		= cgroup_seqfile_show,
};
4118
/*
 * kernfs operations for cgroup files that iterate over their content:
 * same callbacks as cgroup_kf_single_ops plus the seq_start/seq_next/
 * seq_stop wrappers, which forward to the cftype's own iterator methods.
 */
static struct kernfs_ops cgroup_kf_ops = {
	.atomic_write_len	= PAGE_SIZE,
	.open			= cgroup_file_open,
	.release		= cgroup_file_release,
	.write			= cgroup_file_write,
	.poll			= cgroup_file_poll,
	.seq_start		= cgroup_seqfile_start,
	.seq_next		= cgroup_seqfile_next,
	.seq_stop		= cgroup_seqfile_stop,
	.seq_show		= cgroup_seqfile_show,
};
4130
4131 /* set uid and gid of cgroup dirs and files to that of the creator */
4132 static int cgroup_kn_set_ugid(struct kernfs_node *kn)
4133 {
4134         struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
4135                                .ia_uid = current_fsuid(),
4136                                .ia_gid = current_fsgid(), };
4137
4138         if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
4139             gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
4140                 return 0;
4141
4142         return kernfs_setattr(kn, &iattr);
4143 }
4144
4145 static void cgroup_file_notify_timer(struct timer_list *timer)
4146 {
4147         cgroup_file_notify(container_of(timer, struct cgroup_file,
4148                                         notify_timer));
4149 }
4150
4151 static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
4152                            struct cftype *cft)
4153 {
4154         char name[CGROUP_FILE_NAME_MAX];
4155         struct kernfs_node *kn;
4156         struct lock_class_key *key = NULL;
4157         int ret;
4158
4159 #ifdef CONFIG_DEBUG_LOCK_ALLOC
4160         key = &cft->lockdep_key;
4161 #endif
4162         kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
4163                                   cgroup_file_mode(cft),
4164                                   GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
4165                                   0, cft->kf_ops, cft,
4166                                   NULL, key);
4167         if (IS_ERR(kn))
4168                 return PTR_ERR(kn);
4169
4170         ret = cgroup_kn_set_ugid(kn);
4171         if (ret) {
4172                 kernfs_remove(kn);
4173                 return ret;
4174         }
4175
4176         if (cft->file_offset) {
4177                 struct cgroup_file *cfile = (void *)css + cft->file_offset;
4178
4179                 timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0);
4180
4181                 spin_lock_irq(&cgroup_file_kn_lock);
4182                 cfile->kn = kn;
4183                 spin_unlock_irq(&cgroup_file_kn_lock);
4184         }
4185
4186         return 0;
4187 }
4188
4189 /**
4190  * cgroup_addrm_files - add or remove files to a cgroup directory
4191  * @css: the target css
4192  * @cgrp: the target cgroup (usually css->cgroup)
4193  * @cfts: array of cftypes to be added
4194  * @is_add: whether to add or remove
4195  *
4196  * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
4197  * For removals, this function never fails.
4198  */
4199 static int cgroup_addrm_files(struct cgroup_subsys_state *css,
4200                               struct cgroup *cgrp, struct cftype cfts[],
4201                               bool is_add)
4202 {
4203         struct cftype *cft, *cft_end = NULL;
4204         int ret = 0;
4205
4206         lockdep_assert_held(&cgroup_mutex);
4207
4208 restart:
4209         for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
4210                 /* does cft->flags tell us to skip this file on @cgrp? */
4211                 if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
4212                         continue;
4213                 if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
4214                         continue;
4215                 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
4216                         continue;
4217                 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
4218                         continue;
4219                 if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug)
4220                         continue;
4221                 if (is_add) {
4222                         ret = cgroup_add_file(css, cgrp, cft);
4223                         if (ret) {
4224                                 pr_warn("%s: failed to add %s, err=%d\n",
4225                                         __func__, cft->name, ret);
4226                                 cft_end = cft;
4227                                 is_add = false;
4228                                 goto restart;
4229                         }
4230                 } else {
4231                         cgroup_rm_file(cgrp, cft);
4232                 }
4233         }
4234         return ret;
4235 }
4236
4237 static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
4238 {
4239         struct cgroup_subsys *ss = cfts[0].ss;
4240         struct cgroup *root = &ss->root->cgrp;
4241         struct cgroup_subsys_state *css;
4242         int ret = 0;
4243
4244         lockdep_assert_held(&cgroup_mutex);
4245
4246         /* add/rm files for all cgroups created before */
4247         css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
4248                 struct cgroup *cgrp = css->cgroup;
4249
4250                 if (!(css->flags & CSS_VISIBLE))
4251                         continue;
4252
4253                 ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
4254                 if (ret)
4255                         break;
4256         }
4257
4258         if (is_add && !ret)
4259                 kernfs_activate(root->kn);
4260         return ret;
4261 }
4262
4263 static void cgroup_exit_cftypes(struct cftype *cfts)
4264 {
4265         struct cftype *cft;
4266
4267         for (cft = cfts; cft->name[0] != '\0'; cft++) {
4268                 /* free copy for custom atomic_write_len, see init_cftypes() */
4269                 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
4270                         kfree(cft->kf_ops);
4271                 cft->kf_ops = NULL;
4272                 cft->ss = NULL;
4273
4274                 /* revert flags set by cgroup core while adding @cfts */
4275                 cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL |
4276                                 __CFTYPE_ADDED);
4277         }
4278 }
4279
4280 static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
4281 {
4282         struct cftype *cft;
4283         int ret = 0;
4284
4285         for (cft = cfts; cft->name[0] != '\0'; cft++) {
4286                 struct kernfs_ops *kf_ops;
4287
4288                 WARN_ON(cft->ss || cft->kf_ops);
4289
4290                 if (cft->flags & __CFTYPE_ADDED) {
4291                         ret = -EBUSY;
4292                         break;
4293                 }
4294
4295                 if (cft->seq_start)
4296                         kf_ops = &cgroup_kf_ops;
4297                 else
4298                         kf_ops = &cgroup_kf_single_ops;
4299
4300                 /*
4301                  * Ugh... if @cft wants a custom max_write_len, we need to
4302                  * make a copy of kf_ops to set its atomic_write_len.
4303                  */
4304                 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
4305                         kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
4306                         if (!kf_ops) {
4307                                 ret = -ENOMEM;
4308                                 break;
4309                         }
4310                         kf_ops->atomic_write_len = cft->max_write_len;
4311                 }
4312
4313                 cft->kf_ops = kf_ops;
4314                 cft->ss = ss;
4315                 cft->flags |= __CFTYPE_ADDED;
4316         }
4317
4318         if (ret)
4319                 cgroup_exit_cftypes(cfts);
4320         return ret;
4321 }
4322
/*
 * Unregister @cfts with cgroup_mutex already held: unlink the array from
 * the list it is queued on, remove its files from existing cgroups via
 * cgroup_apply_cftypes(@cfts, false) and finally reset the per-entry state
 * with cgroup_exit_cftypes().
 */
static void cgroup_rm_cftypes_locked(struct cftype *cfts)
{
        lockdep_assert_held(&cgroup_mutex);

        list_del(&cfts->node);
        cgroup_apply_cftypes(cfts, false);
        cgroup_exit_cftypes(cfts);
}
4331
4332 /**
4333  * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
4334  * @cfts: zero-length name terminated array of cftypes
4335  *
4336  * Unregister @cfts.  Files described by @cfts are removed from all
4337  * existing cgroups and all future cgroups won't have them either.  This
4338  * function can be called anytime whether @cfts' subsys is attached or not.
4339  *
4340  * Returns 0 on successful unregistration, -ENOENT if @cfts is not
4341  * registered.
4342  */
4343 int cgroup_rm_cftypes(struct cftype *cfts)
4344 {
4345         if (!cfts || cfts[0].name[0] == '\0')
4346                 return 0;
4347
4348         if (!(cfts[0].flags & __CFTYPE_ADDED))
4349                 return -ENOENT;
4350
4351         cgroup_lock();
4352         cgroup_rm_cftypes_locked(cfts);
4353         cgroup_unlock();
4354         return 0;
4355 }
4356
4357 /**
4358  * cgroup_add_cftypes - add an array of cftypes to a subsystem
4359  * @ss: target cgroup subsystem
4360  * @cfts: zero-length name terminated array of cftypes
4361  *
4362  * Register @cfts to @ss.  Files described by @cfts are created for all
4363  * existing cgroups to which @ss is attached and all future cgroups will
4364  * have them too.  This function can be called anytime whether @ss is
4365  * attached or not.
4366  *
4367  * Returns 0 on successful registration, -errno on failure.  Note that this
4368  * function currently returns 0 as long as @cfts registration is successful
4369  * even if some file creation attempts on existing cgroups fail.
4370  */
4371 static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
4372 {
4373         int ret;
4374
4375         if (!cgroup_ssid_enabled(ss->id))
4376                 return 0;
4377
4378         if (!cfts || cfts[0].name[0] == '\0')
4379                 return 0;
4380
4381         ret = cgroup_init_cftypes(ss, cfts);
4382         if (ret)
4383                 return ret;
4384
4385         cgroup_lock();
4386
4387         list_add_tail(&cfts->node, &ss->cfts);
4388         ret = cgroup_apply_cftypes(cfts, true);
4389         if (ret)
4390                 cgroup_rm_cftypes_locked(cfts);
4391
4392         cgroup_unlock();
4393         return ret;
4394 }
4395
4396 /**
4397  * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
4398  * @ss: target cgroup subsystem
4399  * @cfts: zero-length name terminated array of cftypes
4400  *
4401  * Similar to cgroup_add_cftypes() but the added files are only used for
4402  * the default hierarchy.
4403  */
4404 int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
4405 {
4406         struct cftype *cft;
4407
4408         for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
4409                 cft->flags |= __CFTYPE_ONLY_ON_DFL;
4410         return cgroup_add_cftypes(ss, cfts);
4411 }
4412
4413 /**
4414  * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
4415  * @ss: target cgroup subsystem
4416  * @cfts: zero-length name terminated array of cftypes
4417  *
4418  * Similar to cgroup_add_cftypes() but the added files are only used for
4419  * the legacy hierarchies.
4420  */
4421 int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
4422 {
4423         struct cftype *cft;
4424
4425         for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
4426                 cft->flags |= __CFTYPE_NOT_ON_DFL;
4427         return cgroup_add_cftypes(ss, cfts);
4428 }
4429
4430 /**
4431  * cgroup_file_notify - generate a file modified event for a cgroup_file
4432  * @cfile: target cgroup_file
4433  *
4434  * @cfile must have been obtained by setting cftype->file_offset.
4435  */
4436 void cgroup_file_notify(struct cgroup_file *cfile)
4437 {
4438         unsigned long flags;
4439
4440         spin_lock_irqsave(&cgroup_file_kn_lock, flags);
4441         if (cfile->kn) {
4442                 unsigned long last = cfile->notified_at;
4443                 unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV;
4444
4445                 if (time_in_range(jiffies, last, next)) {
4446                         timer_reduce(&cfile->notify_timer, next);
4447                 } else {
4448                         kernfs_notify(cfile->kn);
4449                         cfile->notified_at = jiffies;
4450                 }
4451         }
4452         spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
4453 }
4454
/**
 * cgroup_file_show - show or hide a hidden cgroup file
 * @cfile: target cgroup_file obtained by setting cftype->file_offset
 * @show: whether to show or hide
 *
 * Does nothing visible when @cfile currently has no kernfs node.
 */
void cgroup_file_show(struct cgroup_file *cfile, bool show)
{
        struct kernfs_node *kn;

        /*
         * Sample cfile->kn and take a reference on it while holding the
         * lock, so the node can be used after the lock is dropped.
         * NOTE(review): kn may be NULL here; kernfs_get()/kernfs_put() are
         * presumably NULL-tolerant - confirm against kernfs.
         */
        spin_lock_irq(&cgroup_file_kn_lock);
        kn = cfile->kn;
        kernfs_get(kn);
        spin_unlock_irq(&cgroup_file_kn_lock);

        if (kn)
                kernfs_show(kn, show);

        /* drop the reference taken above */
        kernfs_put(kn);
}
4474
/**
 * css_next_child - find the next child of a given css
 * @pos: the current position (%NULL to initiate traversal)
 * @parent: css whose children to walk
 *
 * This function returns the next child of @parent and should be called
 * under either cgroup_mutex or RCU read lock.  The only requirement is
 * that @parent and @pos are accessible.  The next sibling is guaranteed to
 * be returned regardless of their states.
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 *
 * Return: the child following @pos (the first child when @pos is %NULL),
 * or %NULL once the end of @parent's children list is reached.
 */
struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
                                           struct cgroup_subsys_state *parent)
{
        struct cgroup_subsys_state *next;

        cgroup_assert_mutex_or_rcu_locked();

        /*
         * @pos could already have been unlinked from the sibling list.
         * Once a cgroup is removed, its ->sibling.next is no longer
         * updated when its next sibling changes.  CSS_RELEASED is set when
         * @pos is taken off list, at which time its next pointer is valid,
         * and, as releases are serialized, the one pointed to by the next
         * pointer is guaranteed to not have started release yet.  This
         * implies that if we observe !CSS_RELEASED on @pos in this RCU
         * critical section, the one pointed to by its next pointer is
         * guaranteed to not have finished its RCU grace period even if we
         * have dropped rcu_read_lock() in-between iterations.
         *
         * If @pos has CSS_RELEASED set, its next pointer can't be
         * dereferenced; however, as each css is given a monotonically
         * increasing unique serial number and always appended to the
         * sibling list, the next one can be found by walking the parent's
         * children until the first css with higher serial number than
         * @pos's.  While this path can be slower, it happens iff iteration
         * races against release and the race window is very small.
         */
        if (!pos) {
                /* start of traversal: first entry on @parent's list */
                next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
        } else if (likely(!(pos->flags & CSS_RELEASED))) {
                /* common case: @pos is still linked, follow its next pointer */
                next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
        } else {
                /* @pos was released: rescan for the first higher serial_nr */
                list_for_each_entry_rcu(next, &parent->children, sibling,
                                        lockdep_is_held(&cgroup_mutex))
                        if (next->serial_nr > pos->serial_nr)
                                break;
        }

        /*
         * @next, if not pointing to the head, can be dereferenced and is
         * the next sibling.
         */
        if (&next->sibling != &parent->children)
                return next;
        return NULL;
}
4538
4539 /**
4540  * css_next_descendant_pre - find the next descendant for pre-order walk
4541  * @pos: the current position (%NULL to initiate traversal)
4542  * @root: css whose descendants to walk
4543  *
4544  * To be used by css_for_each_descendant_pre().  Find the next descendant
4545  * to visit for pre-order traversal of @root's descendants.  @root is
4546  * included in the iteration and the first node to be visited.
4547  *
4548  * While this function requires cgroup_mutex or RCU read locking, it
4549  * doesn't require the whole traversal to be contained in a single critical
4550  * section.  This function will return the correct next descendant as long
4551  * as both @pos and @root are accessible and @pos is a descendant of @root.
4552  *
4553  * If a subsystem synchronizes ->css_online() and the start of iteration, a
4554  * css which finished ->css_online() is guaranteed to be visible in the
4555  * future iterations and will stay visible until the last reference is put.
4556  * A css which hasn't finished ->css_online() or already finished
4557  * ->css_offline() may show up during traversal.  It's each subsystem's
4558  * responsibility to synchronize against on/offlining.
4559  */
4560 struct cgroup_subsys_state *
4561 css_next_descendant_pre(struct cgroup_subsys_state *pos,
4562                         struct cgroup_subsys_state *root)
4563 {
4564         struct cgroup_subsys_state *next;
4565
4566         cgroup_assert_mutex_or_rcu_locked();
4567
4568         /* if first iteration, visit @root */
4569         if (!pos)
4570                 return root;
4571
4572         /* visit the first child if exists */
4573         next = css_next_child(NULL, pos);
4574         if (next)
4575                 return next;
4576
4577         /* no child, visit my or the closest ancestor's next sibling */
4578         while (pos != root) {
4579                 next = css_next_child(pos, pos->parent);
4580                 if (next)
4581                         return next;
4582                 pos = pos->parent;
4583         }
4584
4585         return NULL;
4586 }
4587 EXPORT_SYMBOL_GPL(css_next_descendant_pre);
4588
4589 /**
4590  * css_rightmost_descendant - return the rightmost descendant of a css
4591  * @pos: css of interest
4592  *
4593  * Return the rightmost descendant of @pos.  If there's no descendant, @pos
4594  * is returned.  This can be used during pre-order traversal to skip
4595  * subtree of @pos.
4596  *
4597  * While this function requires cgroup_mutex or RCU read locking, it
4598  * doesn't require the whole traversal to be contained in a single critical
4599  * section.  This function will return the correct rightmost descendant as
4600  * long as @pos is accessible.
4601  */
4602 struct cgroup_subsys_state *
4603 css_rightmost_descendant(struct cgroup_subsys_state *pos)
4604 {
4605         struct cgroup_subsys_state *last, *tmp;
4606
4607         cgroup_assert_mutex_or_rcu_locked();
4608
4609         do {
4610                 last = pos;
4611                 /* ->prev isn't RCU safe, walk ->next till the end */
4612                 pos = NULL;
4613                 css_for_each_child(tmp, last)
4614                         pos = tmp;
4615         } while (pos);
4616
4617         return last;
4618 }
4619
4620 static struct cgroup_subsys_state *
4621 css_leftmost_descendant(struct cgroup_subsys_state *pos)
4622 {
4623         struct cgroup_subsys_state *last;
4624
4625         do {
4626                 last = pos;
4627                 pos = css_next_child(NULL, pos);
4628         } while (pos);
4629
4630         return last;
4631 }
4632
4633 /**
4634  * css_next_descendant_post - find the next descendant for post-order walk
4635  * @pos: the current position (%NULL to initiate traversal)
4636  * @root: css whose descendants to walk
4637  *
4638  * To be used by css_for_each_descendant_post().  Find the next descendant
4639  * to visit for post-order traversal of @root's descendants.  @root is
4640  * included in the iteration and the last node to be visited.
4641  *
4642  * While this function requires cgroup_mutex or RCU read locking, it
4643  * doesn't require the whole traversal to be contained in a single critical
4644  * section.  This function will return the correct next descendant as long
4645  * as both @pos and @cgroup are accessible and @pos is a descendant of
4646  * @cgroup.
4647  *
4648  * If a subsystem synchronizes ->css_online() and the start of iteration, a
4649  * css which finished ->css_online() is guaranteed to be visible in the
4650  * future iterations and will stay visible until the last reference is put.
4651  * A css which hasn't finished ->css_online() or already finished
4652  * ->css_offline() may show up during traversal.  It's each subsystem's
4653  * responsibility to synchronize against on/offlining.
4654  */
4655 struct cgroup_subsys_state *
4656 css_next_descendant_post(struct cgroup_subsys_state *pos,
4657                          struct cgroup_subsys_state *root)
4658 {
4659         struct cgroup_subsys_state *next;
4660
4661         cgroup_assert_mutex_or_rcu_locked();
4662
4663         /* if first iteration, visit leftmost descendant which may be @root */
4664         if (!pos)
4665                 return css_leftmost_descendant(root);
4666
4667         /* if we visited @root, we're done */
4668         if (pos == root)
4669                 return NULL;
4670
4671         /* if there's an unvisited sibling, visit its leftmost descendant */
4672         next = css_next_child(pos, pos->parent);
4673         if (next)
4674                 return css_leftmost_descendant(next);
4675
4676         /* no sibling left, visit parent */
4677         return pos->parent;
4678 }
4679
4680 /**
4681  * css_has_online_children - does a css have online children
4682  * @css: the target css
4683  *
4684  * Returns %true if @css has any online children; otherwise, %false.  This
4685  * function can be called from any context but the caller is responsible
4686  * for synchronizing against on/offlining as necessary.
4687  */
4688 bool css_has_online_children(struct cgroup_subsys_state *css)
4689 {
4690         struct cgroup_subsys_state *child;
4691         bool ret = false;
4692
4693         rcu_read_lock();
4694         css_for_each_child(child, css) {
4695                 if (child->flags & CSS_ONLINE) {
4696                         ret = true;
4697                         break;
4698                 }
4699         }
4700         rcu_read_unlock();
4701         return ret;
4702 }
4703
4704 static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
4705 {
4706         struct list_head *l;
4707         struct cgrp_cset_link *link;
4708         struct css_set *cset;
4709
4710         lockdep_assert_held(&css_set_lock);
4711
4712         /* find the next threaded cset */
4713         if (it->tcset_pos) {
4714                 l = it->tcset_pos->next;
4715
4716                 if (l != it->tcset_head) {
4717                         it->tcset_pos = l;
4718                         return container_of(l, struct css_set,
4719                                             threaded_csets_node);
4720                 }
4721
4722                 it->tcset_pos = NULL;
4723         }
4724
4725         /* find the next cset */
4726         l = it->cset_pos;
4727         l = l->next;
4728         if (l == it->cset_head) {
4729                 it->cset_pos = NULL;
4730                 return NULL;
4731         }
4732
4733         if (it->ss) {
4734                 cset = container_of(l, struct css_set, e_cset_node[it->ss->id]);
4735         } else {
4736                 link = list_entry(l, struct cgrp_cset_link, cset_link);
4737                 cset = link->cset;
4738         }
4739
4740         it->cset_pos = l;
4741
4742         /* initialize threaded css_set walking */
4743         if (it->flags & CSS_TASK_ITER_THREADED) {
4744                 if (it->cur_dcset)
4745                         put_css_set_locked(it->cur_dcset);
4746                 it->cur_dcset = cset;
4747                 get_css_set(cset);
4748
4749                 it->tcset_head = &cset->threaded_csets;
4750                 it->tcset_pos = &cset->threaded_csets;
4751         }
4752
4753         return cset;
4754 }
4755
4756 /**
4757  * css_task_iter_advance_css_set - advance a task iterator to the next css_set
4758  * @it: the iterator to advance
4759  *
4760  * Advance @it to the next css_set to walk.
4761  */
4762 static void css_task_iter_advance_css_set(struct css_task_iter *it)
4763 {
4764         struct css_set *cset;
4765
4766         lockdep_assert_held(&css_set_lock);
4767
4768         /* Advance to the next non-empty css_set and find first non-empty tasks list*/
4769         while ((cset = css_task_iter_next_css_set(it))) {
4770                 if (!list_empty(&cset->tasks)) {
4771                         it->cur_tasks_head = &cset->tasks;
4772                         break;
4773                 } else if (!list_empty(&cset->mg_tasks)) {
4774                         it->cur_tasks_head = &cset->mg_tasks;
4775                         break;
4776                 } else if (!list_empty(&cset->dying_tasks)) {
4777                         it->cur_tasks_head = &cset->dying_tasks;
4778                         break;
4779                 }
4780         }
4781         if (!cset) {
4782                 it->task_pos = NULL;
4783                 return;
4784         }
4785         it->task_pos = it->cur_tasks_head->next;
4786
4787         /*
4788          * We don't keep css_sets locked across iteration steps and thus
4789          * need to take steps to ensure that iteration can be resumed after
4790          * the lock is re-acquired.  Iteration is performed at two levels -
4791          * css_sets and tasks in them.
4792          *
4793          * Once created, a css_set never leaves its cgroup lists, so a
4794          * pinned css_set is guaranteed to stay put and we can resume
4795          * iteration afterwards.
4796          *
4797          * Tasks may leave @cset across iteration steps.  This is resolved
4798          * by registering each iterator with the css_set currently being
4799          * walked and making css_set_move_task() advance iterators whose
4800          * next task is leaving.
4801          */
4802         if (it->cur_cset) {
4803                 list_del(&it->iters_node);
4804                 put_css_set_locked(it->cur_cset);
4805         }
4806         get_css_set(cset);
4807         it->cur_cset = cset;
4808         list_add(&it->iters_node, &cset->task_iters);
4809 }
4810
4811 static void css_task_iter_skip(struct css_task_iter *it,
4812                                struct task_struct *task)
4813 {
4814         lockdep_assert_held(&css_set_lock);
4815
4816         if (it->task_pos == &task->cg_list) {
4817                 it->task_pos = it->task_pos->next;
4818                 it->flags |= CSS_TASK_ITER_SKIPPED;
4819         }
4820 }
4821
/*
 * Move @it to the next task the iteration should report, or leave
 * @it->task_pos %NULL when there is none.
 *
 * With CSS_TASK_ITER_PROCS only thread group leaders are kept, and a
 * leader found on dying_tasks only while its signal->live count is
 * non-zero.  Without it, every entry on dying_tasks is skipped.
 */
static void css_task_iter_advance(struct css_task_iter *it)
{
        struct task_struct *task;

        lockdep_assert_held(&css_set_lock);
repeat:
        if (it->task_pos) {
                /*
                 * Advance iterator to find next entry. We go through cset
                 * tasks, mg_tasks and dying_tasks, when consumed we move onto
                 * the next cset.
                 */
                /* a pending skip already moved task_pos; just consume it */
                if (it->flags & CSS_TASK_ITER_SKIPPED)
                        it->flags &= ~CSS_TASK_ITER_SKIPPED;
                else
                        it->task_pos = it->task_pos->next;

                /*
                 * Reaching a list head means that list is done.  The
                 * checks are chained so that empty lists fall through.
                 */
                if (it->task_pos == &it->cur_cset->tasks) {
                        it->cur_tasks_head = &it->cur_cset->mg_tasks;
                        it->task_pos = it->cur_tasks_head->next;
                }
                if (it->task_pos == &it->cur_cset->mg_tasks) {
                        it->cur_tasks_head = &it->cur_cset->dying_tasks;
                        it->task_pos = it->cur_tasks_head->next;
                }
                if (it->task_pos == &it->cur_cset->dying_tasks)
                        css_task_iter_advance_css_set(it);
        } else {
                /* called from start, proceed to the first cset */
                css_task_iter_advance_css_set(it);
        }

        /* ran out of css_sets */
        if (!it->task_pos)
                return;

        task = list_entry(it->task_pos, struct task_struct, cg_list);

        if (it->flags & CSS_TASK_ITER_PROCS) {
                /* if PROCS, skip over tasks which aren't group leaders */
                if (!thread_group_leader(task))
                        goto repeat;

                /* and dying leaders w/o live member threads */
                if (it->cur_tasks_head == &it->cur_cset->dying_tasks &&
                    !atomic_read(&task->signal->live))
                        goto repeat;
        } else {
                /* skip all dying ones */
                if (it->cur_tasks_head == &it->cur_cset->dying_tasks)
                        goto repeat;
        }
}
4874
4875 /**
4876  * css_task_iter_start - initiate task iteration
4877  * @css: the css to walk tasks of
4878  * @flags: CSS_TASK_ITER_* flags
4879  * @it: the task iterator to use
4880  *
4881  * Initiate iteration through the tasks of @css.  The caller can call
4882  * css_task_iter_next() to walk through the tasks until the function
4883  * returns NULL.  On completion of iteration, css_task_iter_end() must be
4884  * called.
4885  */
4886 void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
4887                          struct css_task_iter *it)
4888 {
4889         memset(it, 0, sizeof(*it));
4890
4891         spin_lock_irq(&css_set_lock);
4892
4893         it->ss = css->ss;
4894         it->flags = flags;
4895
4896         if (CGROUP_HAS_SUBSYS_CONFIG && it->ss)
4897                 it->cset_pos = &css->cgroup->e_csets[css->ss->id];
4898         else
4899                 it->cset_pos = &css->cgroup->cset_links;
4900
4901         it->cset_head = it->cset_pos;
4902
4903         css_task_iter_advance(it);
4904
4905         spin_unlock_irq(&css_set_lock);
4906 }
4907
4908 /**
4909  * css_task_iter_next - return the next task for the iterator
4910  * @it: the task iterator being iterated
4911  *
4912  * The "next" function for task iteration.  @it should have been
4913  * initialized via css_task_iter_start().  Returns NULL when the iteration
4914  * reaches the end.
4915  */
4916 struct task_struct *css_task_iter_next(struct css_task_iter *it)
4917 {
4918         if (it->cur_task) {
4919                 put_task_struct(it->cur_task);
4920                 it->cur_task = NULL;
4921         }
4922
4923         spin_lock_irq(&css_set_lock);
4924
4925         /* @it may be half-advanced by skips, finish advancing */
4926         if (it->flags & CSS_TASK_ITER_SKIPPED)
4927                 css_task_iter_advance(it);
4928
4929         if (it->task_pos) {
4930                 it->cur_task = list_entry(it->task_pos, struct task_struct,
4931                                           cg_list);
4932                 get_task_struct(it->cur_task);
4933                 css_task_iter_advance(it);
4934         }
4935
4936         spin_unlock_irq(&css_set_lock);
4937
4938         return it->cur_task;
4939 }
4940
/**
 * css_task_iter_end - finish task iteration
 * @it: the task iterator to finish
 *
 * Finish task iteration started by css_task_iter_start().  Drops whatever
 * the iterator still holds: its registration with and reference on the
 * current css_set, the reference on the current domain css_set and the
 * reference on the last returned task.
 */
void css_task_iter_end(struct css_task_iter *it)
{
        if (it->cur_cset) {
                /* unregister from the css_set being walked and unpin it */
                spin_lock_irq(&css_set_lock);
                list_del(&it->iters_node);
                put_css_set_locked(it->cur_cset);
                spin_unlock_irq(&css_set_lock);
        }

        /* pinned by css_task_iter_next_css_set() for threaded walks */
        if (it->cur_dcset)
                put_css_set(it->cur_dcset);

        /* pinned by css_task_iter_next() */
        if (it->cur_task)
                put_task_struct(it->cur_task);
}
4962
4963 static void cgroup_procs_release(struct kernfs_open_file *of)
4964 {
4965         struct cgroup_file_ctx *ctx = of->priv;
4966
4967         if (ctx->procs.started)
4968                 css_task_iter_end(&ctx->procs.iter);
4969 }
4970
4971 static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
4972 {
4973         struct kernfs_open_file *of = s->private;
4974         struct cgroup_file_ctx *ctx = of->priv;
4975
4976         if (pos)
4977                 (*pos)++;
4978
4979         return css_task_iter_next(&ctx->procs.iter);
4980 }
4981
4982 static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
4983                                   unsigned int iter_flags)
4984 {
4985         struct kernfs_open_file *of = s->private;
4986         struct cgroup *cgrp = seq_css(s)->cgroup;
4987         struct cgroup_file_ctx *ctx = of->priv;
4988         struct css_task_iter *it = &ctx->procs.iter;
4989
4990         /*
4991          * When a seq_file is seeked, it's always traversed sequentially
4992          * from position 0, so we can simply keep iterating on !0 *pos.
4993          */
4994         if (!ctx->procs.started) {
4995                 if (WARN_ON_ONCE((*pos)))
4996                         return ERR_PTR(-EINVAL);
4997                 css_task_iter_start(&cgrp->self, iter_flags, it);
4998                 ctx->procs.started = true;
4999         } else if (!(*pos)) {
5000                 css_task_iter_end(it);
5001                 css_task_iter_start(&cgrp->self, iter_flags, it);
5002         } else
5003                 return it->cur_task;
5004
5005         return cgroup_procs_next(s, NULL, NULL);
5006 }
5007
5008 static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
5009 {
5010         struct cgroup *cgrp = seq_css(s)->cgroup;
5011
5012         /*
5013          * All processes of a threaded subtree belong to the domain cgroup
5014          * of the subtree.  Only threads can be distributed across the
5015          * subtree.  Reject reads on cgroup.procs in the subtree proper.
5016          * They're always empty anyway.
5017          */
5018         if (cgroup_is_threaded(cgrp))
5019                 return ERR_PTR(-EOPNOTSUPP);
5020
5021         return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS |
5022                                             CSS_TASK_ITER_THREADED);
5023 }
5024
/*
 * ->seq_show handler: @v is the task handed out by the start/next
 * handlers; emit its pid as returned by task_pid_vnr(), one per line.
 */
static int cgroup_procs_show(struct seq_file *s, void *v)
{
	struct task_struct *task = v;

	seq_printf(s, "%d\n", task_pid_vnr(task));

	return 0;
}
5030
5031 static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
5032 {
5033         int ret;
5034         struct inode *inode;
5035
5036         lockdep_assert_held(&cgroup_mutex);
5037
5038         inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
5039         if (!inode)
5040                 return -ENOMEM;
5041
5042         ret = inode_permission(&nop_mnt_idmap, inode, MAY_WRITE);
5043         iput(inode);
5044         return ret;
5045 }
5046
5047 static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
5048                                          struct cgroup *dst_cgrp,
5049                                          struct super_block *sb,
5050                                          struct cgroup_namespace *ns)
5051 {
5052         struct cgroup *com_cgrp = src_cgrp;
5053         int ret;
5054
5055         lockdep_assert_held(&cgroup_mutex);
5056
5057         /* find the common ancestor */
5058         while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
5059                 com_cgrp = cgroup_parent(com_cgrp);
5060
5061         /* %current should be authorized to migrate to the common ancestor */
5062         ret = cgroup_may_write(com_cgrp, sb);
5063         if (ret)
5064                 return ret;
5065
5066         /*
5067          * If namespaces are delegation boundaries, %current must be able
5068          * to see both source and destination cgroups from its namespace.
5069          */
5070         if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
5071             (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) ||
5072              !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp)))
5073                 return -ENOENT;
5074
5075         return 0;
5076 }
5077
5078 static int cgroup_attach_permissions(struct cgroup *src_cgrp,
5079                                      struct cgroup *dst_cgrp,
5080                                      struct super_block *sb, bool threadgroup,
5081                                      struct cgroup_namespace *ns)
5082 {
5083         int ret = 0;
5084
5085         ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns);
5086         if (ret)
5087                 return ret;
5088
5089         ret = cgroup_migrate_vet_dst(dst_cgrp);
5090         if (ret)
5091                 return ret;
5092
5093         if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp))
5094                 ret = -EOPNOTSUPP;
5095
5096         return ret;
5097 }
5098
5099 static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
5100                                     bool threadgroup)
5101 {
5102         struct cgroup_file_ctx *ctx = of->priv;
5103         struct cgroup *src_cgrp, *dst_cgrp;
5104         struct task_struct *task;
5105         const struct cred *saved_cred;
5106         ssize_t ret;
5107         bool threadgroup_locked;
5108
5109         dst_cgrp = cgroup_kn_lock_live(of->kn, false);
5110         if (!dst_cgrp)
5111                 return -ENODEV;
5112
5113         task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked);
5114         ret = PTR_ERR_OR_ZERO(task);
5115         if (ret)
5116                 goto out_unlock;
5117
5118         /* find the source cgroup */
5119         spin_lock_irq(&css_set_lock);
5120         src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
5121         spin_unlock_irq(&css_set_lock);
5122
5123         /*
5124          * Process and thread migrations follow same delegation rule. Check
5125          * permissions using the credentials from file open to protect against
5126          * inherited fd attacks.
5127          */
5128         saved_cred = override_creds(of->file->f_cred);
5129         ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
5130                                         of->file->f_path.dentry->d_sb,
5131                                         threadgroup, ctx->ns);
5132         revert_creds(saved_cred);
5133         if (ret)
5134                 goto out_finish;
5135
5136         ret = cgroup_attach_task(dst_cgrp, task, threadgroup);
5137
5138 out_finish:
5139         cgroup_procs_write_finish(task, threadgroup_locked);
5140 out_unlock:
5141         cgroup_kn_unlock(of->kn);
5142
5143         return ret;
5144 }
5145
5146 static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
5147                                   char *buf, size_t nbytes, loff_t off)
5148 {
5149         return __cgroup_procs_write(of, buf, true) ?: nbytes;
5150 }
5151
5152 static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
5153 {
5154         return __cgroup_procs_start(s, pos, 0);
5155 }
5156
5157 static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
5158                                     char *buf, size_t nbytes, loff_t off)
5159 {
5160         return __cgroup_procs_write(of, buf, false) ?: nbytes;
5161 }
5162
5163 /* cgroup core interface files for the default hierarchy */
5164 static struct cftype cgroup_base_files[] = {
5165         {
5166                 .name = "cgroup.type",
5167                 .flags = CFTYPE_NOT_ON_ROOT,
5168                 .seq_show = cgroup_type_show,
5169                 .write = cgroup_type_write,
5170         },
5171         {
5172                 .name = "cgroup.procs",
5173                 .flags = CFTYPE_NS_DELEGATABLE,
5174                 .file_offset = offsetof(struct cgroup, procs_file),
5175                 .release = cgroup_procs_release,
5176                 .seq_start = cgroup_procs_start,
5177                 .seq_next = cgroup_procs_next,
5178                 .seq_show = cgroup_procs_show,
5179                 .write = cgroup_procs_write,
5180         },
5181         {
5182                 .name = "cgroup.threads",
5183                 .flags = CFTYPE_NS_DELEGATABLE,
5184                 .release = cgroup_procs_release,
5185                 .seq_start = cgroup_threads_start,
5186                 .seq_next = cgroup_procs_next,
5187                 .seq_show = cgroup_procs_show,
5188                 .write = cgroup_threads_write,
5189         },
5190         {
5191                 .name = "cgroup.controllers",
5192                 .seq_show = cgroup_controllers_show,
5193         },
5194         {
5195                 .name = "cgroup.subtree_control",
5196                 .flags = CFTYPE_NS_DELEGATABLE,
5197                 .seq_show = cgroup_subtree_control_show,
5198                 .write = cgroup_subtree_control_write,
5199         },
5200         {
5201                 .name = "cgroup.events",
5202                 .flags = CFTYPE_NOT_ON_ROOT,
5203                 .file_offset = offsetof(struct cgroup, events_file),
5204                 .seq_show = cgroup_events_show,
5205         },
5206         {
5207                 .name = "cgroup.max.descendants",
5208                 .seq_show = cgroup_max_descendants_show,
5209                 .write = cgroup_max_descendants_write,
5210         },
5211         {
5212                 .name = "cgroup.max.depth",
5213                 .seq_show = cgroup_max_depth_show,
5214                 .write = cgroup_max_depth_write,
5215         },
5216         {
5217                 .name = "cgroup.stat",
5218                 .seq_show = cgroup_stat_show,
5219         },
5220         {
5221                 .name = "cgroup.freeze",
5222                 .flags = CFTYPE_NOT_ON_ROOT,
5223                 .seq_show = cgroup_freeze_show,
5224                 .write = cgroup_freeze_write,
5225         },
5226         {
5227                 .name = "cgroup.kill",
5228                 .flags = CFTYPE_NOT_ON_ROOT,
5229                 .write = cgroup_kill_write,
5230         },
5231         {
5232                 .name = "cpu.stat",
5233                 .seq_show = cpu_stat_show,
5234         },
5235         { }     /* terminate */
5236 };
5237
5238 static struct cftype cgroup_psi_files[] = {
5239 #ifdef CONFIG_PSI
5240         {
5241                 .name = "io.pressure",
5242                 .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
5243                 .open = cgroup_pressure_open,
5244                 .seq_show = cgroup_io_pressure_show,
5245                 .write = cgroup_io_pressure_write,
5246                 .poll = cgroup_pressure_poll,
5247                 .release = cgroup_pressure_release,
5248         },
5249         {
5250                 .name = "memory.pressure",
5251                 .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
5252                 .open = cgroup_pressure_open,
5253                 .seq_show = cgroup_memory_pressure_show,
5254                 .write = cgroup_memory_pressure_write,
5255                 .poll = cgroup_pressure_poll,
5256                 .release = cgroup_pressure_release,
5257         },
5258         {
5259                 .name = "cpu.pressure",
5260                 .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
5261                 .open = cgroup_pressure_open,
5262                 .seq_show = cgroup_cpu_pressure_show,
5263                 .write = cgroup_cpu_pressure_write,
5264                 .poll = cgroup_pressure_poll,
5265                 .release = cgroup_pressure_release,
5266         },
5267 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
5268         {
5269                 .name = "irq.pressure",
5270                 .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
5271                 .open = cgroup_pressure_open,
5272                 .seq_show = cgroup_irq_pressure_show,
5273                 .write = cgroup_irq_pressure_write,
5274                 .poll = cgroup_pressure_poll,
5275                 .release = cgroup_pressure_release,
5276         },
5277 #endif
5278         {
5279                 .name = "cgroup.pressure",
5280                 .seq_show = cgroup_pressure_show,
5281                 .write = cgroup_pressure_write,
5282         },
5283 #endif /* CONFIG_PSI */
5284         { }     /* terminate */
5285 };
5286
5287 /*
5288  * css destruction is four-stage process.
5289  *
5290  * 1. Destruction starts.  Killing of the percpu_ref is initiated.
5291  *    Implemented in kill_css().
5292  *
5293  * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
5294  *    and thus css_tryget_online() is guaranteed to fail, the css can be
5295  *    offlined by invoking offline_css().  After offlining, the base ref is
5296  *    put.  Implemented in css_killed_work_fn().
5297  *
5298  * 3. When the percpu_ref reaches zero, the only possible remaining
5299  *    accessors are inside RCU read sections.  css_release() schedules the
5300  *    RCU callback.
5301  *
5302  * 4. After the grace period, the css can be freed.  Implemented in
5303  *    css_free_work_fn().
5304  *
5305  * It is actually hairier because both step 2 and 4 require process context
5306  * and thus involve punting to css->destroy_work adding two additional
5307  * steps to the already complex sequence.
5308  */
5309 static void css_free_rwork_fn(struct work_struct *work)
5310 {
5311         struct cgroup_subsys_state *css = container_of(to_rcu_work(work),
5312                                 struct cgroup_subsys_state, destroy_rwork);
5313         struct cgroup_subsys *ss = css->ss;
5314         struct cgroup *cgrp = css->cgroup;
5315
5316         percpu_ref_exit(&css->refcnt);
5317
5318         if (ss) {
5319                 /* css free path */
5320                 struct cgroup_subsys_state *parent = css->parent;
5321                 int id = css->id;
5322
5323                 ss->css_free(css);
5324                 cgroup_idr_remove(&ss->css_idr, id);
5325                 cgroup_put(cgrp);
5326
5327                 if (parent)
5328                         css_put(parent);
5329         } else {
5330                 /* cgroup free path */
5331                 atomic_dec(&cgrp->root->nr_cgrps);
5332                 cgroup1_pidlist_destroy_all(cgrp);
5333                 cancel_work_sync(&cgrp->release_agent_work);
5334                 bpf_cgrp_storage_free(cgrp);
5335
5336                 if (cgroup_parent(cgrp)) {
5337                         /*
5338                          * We get a ref to the parent, and put the ref when
5339                          * this cgroup is being freed, so it's guaranteed
5340                          * that the parent won't be destroyed before its
5341                          * children.
5342                          */
5343                         cgroup_put(cgroup_parent(cgrp));
5344                         kernfs_put(cgrp->kn);
5345                         psi_cgroup_free(cgrp);
5346                         cgroup_rstat_exit(cgrp);
5347                         kfree(cgrp);
5348                 } else {
5349                         /*
5350                          * This is root cgroup's refcnt reaching zero,
5351                          * which indicates that the root should be
5352                          * released.
5353                          */
5354                         cgroup_destroy_root(cgrp->root);
5355                 }
5356         }
5357 }
5358
5359 static void css_release_work_fn(struct work_struct *work)
5360 {
5361         struct cgroup_subsys_state *css =
5362                 container_of(work, struct cgroup_subsys_state, destroy_work);
5363         struct cgroup_subsys *ss = css->ss;
5364         struct cgroup *cgrp = css->cgroup;
5365
5366         cgroup_lock();
5367
5368         css->flags |= CSS_RELEASED;
5369         list_del_rcu(&css->sibling);
5370
5371         if (ss) {
5372                 /* css release path */
5373                 if (!list_empty(&css->rstat_css_node)) {
5374                         cgroup_rstat_flush(cgrp);
5375                         list_del_rcu(&css->rstat_css_node);
5376                 }
5377
5378                 cgroup_idr_replace(&ss->css_idr, NULL, css->id);
5379                 if (ss->css_released)
5380                         ss->css_released(css);
5381         } else {
5382                 struct cgroup *tcgrp;
5383
5384                 /* cgroup release path */
5385                 TRACE_CGROUP_PATH(release, cgrp);
5386
5387                 cgroup_rstat_flush(cgrp);
5388
5389                 spin_lock_irq(&css_set_lock);
5390                 for (tcgrp = cgroup_parent(cgrp); tcgrp;
5391                      tcgrp = cgroup_parent(tcgrp))
5392                         tcgrp->nr_dying_descendants--;
5393                 spin_unlock_irq(&css_set_lock);
5394
5395                 /*
5396                  * There are two control paths which try to determine
5397                  * cgroup from dentry without going through kernfs -
5398                  * cgroupstats_build() and css_tryget_online_from_dir().
5399                  * Those are supported by RCU protecting clearing of
5400                  * cgrp->kn->priv backpointer.
5401                  */
5402                 if (cgrp->kn)
5403                         RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
5404                                          NULL);
5405         }
5406
5407         cgroup_unlock();
5408
5409         INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
5410         queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
5411 }
5412
5413 static void css_release(struct percpu_ref *ref)
5414 {
5415         struct cgroup_subsys_state *css =
5416                 container_of(ref, struct cgroup_subsys_state, refcnt);
5417
5418         INIT_WORK(&css->destroy_work, css_release_work_fn);
5419         queue_work(cgroup_destroy_wq, &css->destroy_work);
5420 }
5421
5422 static void init_and_link_css(struct cgroup_subsys_state *css,
5423                               struct cgroup_subsys *ss, struct cgroup *cgrp)
5424 {
5425         lockdep_assert_held(&cgroup_mutex);
5426
5427         cgroup_get_live(cgrp);
5428
5429         memset(css, 0, sizeof(*css));
5430         css->cgroup = cgrp;
5431         css->ss = ss;
5432         css->id = -1;
5433         INIT_LIST_HEAD(&css->sibling);
5434         INIT_LIST_HEAD(&css->children);
5435         INIT_LIST_HEAD(&css->rstat_css_node);
5436         css->serial_nr = css_serial_nr_next++;
5437         atomic_set(&css->online_cnt, 0);
5438
5439         if (cgroup_parent(cgrp)) {
5440                 css->parent = cgroup_css(cgroup_parent(cgrp), ss);
5441                 css_get(css->parent);
5442         }
5443
5444         if (ss->css_rstat_flush)
5445                 list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list);
5446
5447         BUG_ON(cgroup_css(cgrp, ss));
5448 }
5449
5450 /* invoke ->css_online() on a new CSS and mark it online if successful */
5451 static int online_css(struct cgroup_subsys_state *css)
5452 {
5453         struct cgroup_subsys *ss = css->ss;
5454         int ret = 0;
5455
5456         lockdep_assert_held(&cgroup_mutex);
5457
5458         if (ss->css_online)
5459                 ret = ss->css_online(css);
5460         if (!ret) {
5461                 css->flags |= CSS_ONLINE;
5462                 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
5463
5464                 atomic_inc(&css->online_cnt);
5465                 if (css->parent)
5466                         atomic_inc(&css->parent->online_cnt);
5467         }
5468         return ret;
5469 }
5470
5471 /* if the CSS is online, invoke ->css_offline() on it and mark it offline */
5472 static void offline_css(struct cgroup_subsys_state *css)
5473 {
5474         struct cgroup_subsys *ss = css->ss;
5475
5476         lockdep_assert_held(&cgroup_mutex);
5477
5478         if (!(css->flags & CSS_ONLINE))
5479                 return;
5480
5481         if (ss->css_offline)
5482                 ss->css_offline(css);
5483
5484         css->flags &= ~CSS_ONLINE;
5485         RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
5486
5487         wake_up_all(&css->cgroup->offline_waitq);
5488 }
5489
5490 /**
5491  * css_create - create a cgroup_subsys_state
5492  * @cgrp: the cgroup new css will be associated with
5493  * @ss: the subsys of new css
5494  *
5495  * Create a new css associated with @cgrp - @ss pair.  On success, the new
5496  * css is online and installed in @cgrp.  This function doesn't create the
5497  * interface files.  Returns 0 on success, -errno on failure.
5498  */
5499 static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
5500                                               struct cgroup_subsys *ss)
5501 {
5502         struct cgroup *parent = cgroup_parent(cgrp);
5503         struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
5504         struct cgroup_subsys_state *css;
5505         int err;
5506
5507         lockdep_assert_held(&cgroup_mutex);
5508
5509         css = ss->css_alloc(parent_css);
5510         if (!css)
5511                 css = ERR_PTR(-ENOMEM);
5512         if (IS_ERR(css))
5513                 return css;
5514
5515         init_and_link_css(css, ss, cgrp);
5516
5517         err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
5518         if (err)
5519                 goto err_free_css;
5520
5521         err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
5522         if (err < 0)
5523                 goto err_free_css;
5524         css->id = err;
5525
5526         /* @css is ready to be brought online now, make it visible */
5527         list_add_tail_rcu(&css->sibling, &parent_css->children);
5528         cgroup_idr_replace(&ss->css_idr, css, css->id);
5529
5530         err = online_css(css);
5531         if (err)
5532                 goto err_list_del;
5533
5534         return css;
5535
5536 err_list_del:
5537         list_del_rcu(&css->sibling);
5538 err_free_css:
5539         list_del_rcu(&css->rstat_css_node);
5540         INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
5541         queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
5542         return ERR_PTR(err);
5543 }
5544
5545 /*
5546  * The returned cgroup is fully initialized including its control mask, but
5547  * it isn't associated with its kernfs_node and doesn't have the control
5548  * mask applied.
5549  */
5550 static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
5551                                     umode_t mode)
5552 {
5553         struct cgroup_root *root = parent->root;
5554         struct cgroup *cgrp, *tcgrp;
5555         struct kernfs_node *kn;
5556         int level = parent->level + 1;
5557         int ret;
5558
5559         /* allocate the cgroup and its ID, 0 is reserved for the root */
5560         cgrp = kzalloc(struct_size(cgrp, ancestors, (level + 1)), GFP_KERNEL);
5561         if (!cgrp)
5562                 return ERR_PTR(-ENOMEM);
5563
5564         ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
5565         if (ret)
5566                 goto out_free_cgrp;
5567
5568         ret = cgroup_rstat_init(cgrp);
5569         if (ret)
5570                 goto out_cancel_ref;
5571
5572         /* create the directory */
5573         kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
5574         if (IS_ERR(kn)) {
5575                 ret = PTR_ERR(kn);
5576                 goto out_stat_exit;
5577         }
5578         cgrp->kn = kn;
5579
5580         init_cgroup_housekeeping(cgrp);
5581
5582         cgrp->self.parent = &parent->self;
5583         cgrp->root = root;
5584         cgrp->level = level;
5585
5586         ret = psi_cgroup_alloc(cgrp);
5587         if (ret)
5588                 goto out_kernfs_remove;
5589
5590         ret = cgroup_bpf_inherit(cgrp);
5591         if (ret)
5592                 goto out_psi_free;
5593
5594         /*
5595          * New cgroup inherits effective freeze counter, and
5596          * if the parent has to be frozen, the child has too.
5597          */
5598         cgrp->freezer.e_freeze = parent->freezer.e_freeze;
5599         if (cgrp->freezer.e_freeze) {
5600                 /*
5601                  * Set the CGRP_FREEZE flag, so when a process will be
5602                  * attached to the child cgroup, it will become frozen.
5603                  * At this point the new cgroup is unpopulated, so we can
5604                  * consider it frozen immediately.
5605                  */
5606                 set_bit(CGRP_FREEZE, &cgrp->flags);
5607                 set_bit(CGRP_FROZEN, &cgrp->flags);
5608         }
5609
5610         spin_lock_irq(&css_set_lock);
5611         for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
5612                 cgrp->ancestors[tcgrp->level] = tcgrp;
5613
5614                 if (tcgrp != cgrp) {
5615                         tcgrp->nr_descendants++;
5616
5617                         /*
5618                          * If the new cgroup is frozen, all ancestor cgroups
5619                          * get a new frozen descendant, but their state can't
5620                          * change because of this.
5621                          */
5622                         if (cgrp->freezer.e_freeze)
5623                                 tcgrp->freezer.nr_frozen_descendants++;
5624                 }
5625         }
5626         spin_unlock_irq(&css_set_lock);
5627
5628         if (notify_on_release(parent))
5629                 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
5630
5631         if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
5632                 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
5633
5634         cgrp->self.serial_nr = css_serial_nr_next++;
5635
5636         /* allocation complete, commit to creation */
5637         list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
5638         atomic_inc(&root->nr_cgrps);
5639         cgroup_get_live(parent);
5640
5641         /*
5642          * On the default hierarchy, a child doesn't automatically inherit
5643          * subtree_control from the parent.  Each is configured manually.
5644          */
5645         if (!cgroup_on_dfl(cgrp))
5646                 cgrp->subtree_control = cgroup_control(cgrp);
5647
5648         cgroup_propagate_control(cgrp);
5649
5650         return cgrp;
5651
5652 out_psi_free:
5653         psi_cgroup_free(cgrp);
5654 out_kernfs_remove:
5655         kernfs_remove(cgrp->kn);
5656 out_stat_exit:
5657         cgroup_rstat_exit(cgrp);
5658 out_cancel_ref:
5659         percpu_ref_exit(&cgrp->self.refcnt);
5660 out_free_cgrp:
5661         kfree(cgrp);
5662         return ERR_PTR(ret);
5663 }
5664
5665 static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
5666 {
5667         struct cgroup *cgroup;
5668         int ret = false;
5669         int level = 1;
5670
5671         lockdep_assert_held(&cgroup_mutex);
5672
5673         for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) {
5674                 if (cgroup->nr_descendants >= cgroup->max_descendants)
5675                         goto fail;
5676
5677                 if (level > cgroup->max_depth)
5678                         goto fail;
5679
5680                 level++;
5681         }
5682
5683         ret = true;
5684 fail:
5685         return ret;
5686 }
5687
5688 int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
5689 {
5690         struct cgroup *parent, *cgrp;
5691         int ret;
5692
5693         /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
5694         if (strchr(name, '\n'))
5695                 return -EINVAL;
5696
5697         parent = cgroup_kn_lock_live(parent_kn, false);
5698         if (!parent)
5699                 return -ENODEV;
5700
5701         if (!cgroup_check_hierarchy_limits(parent)) {
5702                 ret = -EAGAIN;
5703                 goto out_unlock;
5704         }
5705
5706         cgrp = cgroup_create(parent, name, mode);
5707         if (IS_ERR(cgrp)) {
5708                 ret = PTR_ERR(cgrp);
5709                 goto out_unlock;
5710         }
5711
5712         /*
5713          * This extra ref will be put in cgroup_free_fn() and guarantees
5714          * that @cgrp->kn is always accessible.
5715          */
5716         kernfs_get(cgrp->kn);
5717
5718         ret = cgroup_kn_set_ugid(cgrp->kn);
5719         if (ret)
5720                 goto out_destroy;
5721
5722         ret = css_populate_dir(&cgrp->self);
5723         if (ret)
5724                 goto out_destroy;
5725
5726         ret = cgroup_apply_control_enable(cgrp);
5727         if (ret)
5728                 goto out_destroy;
5729
5730         TRACE_CGROUP_PATH(mkdir, cgrp);
5731
5732         /* let's create and online css's */
5733         kernfs_activate(cgrp->kn);
5734
5735         ret = 0;
5736         goto out_unlock;
5737
5738 out_destroy:
5739         cgroup_destroy_locked(cgrp);
5740 out_unlock:
5741         cgroup_kn_unlock(parent_kn);
5742         return ret;
5743 }
5744
5745 /*
5746  * This is called when the refcnt of a css is confirmed to be killed.
5747  * css_tryget_online() is now guaranteed to fail.  Tell the subsystem to
5748  * initiate destruction and put the css ref from kill_css().
5749  */
5750 static void css_killed_work_fn(struct work_struct *work)
5751 {
5752         struct cgroup_subsys_state *css =
5753                 container_of(work, struct cgroup_subsys_state, destroy_work);
5754
5755         cgroup_lock();
5756
5757         do {
5758                 offline_css(css);
5759                 css_put(css);
5760                 /* @css can't go away while we're holding cgroup_mutex */
5761                 css = css->parent;
5762         } while (css && atomic_dec_and_test(&css->online_cnt));
5763
5764         cgroup_unlock();
5765 }
5766
5767 /* css kill confirmation processing requires process context, bounce */
5768 static void css_killed_ref_fn(struct percpu_ref *ref)
5769 {
5770         struct cgroup_subsys_state *css =
5771                 container_of(ref, struct cgroup_subsys_state, refcnt);
5772
5773         if (atomic_dec_and_test(&css->online_cnt)) {
5774                 INIT_WORK(&css->destroy_work, css_killed_work_fn);
5775                 queue_work(cgroup_destroy_wq, &css->destroy_work);
5776         }
5777 }
5778
5779 /**
5780  * kill_css - destroy a css
5781  * @css: css to destroy
5782  *
5783  * This function initiates destruction of @css by removing cgroup interface
5784  * files and putting its base reference.  ->css_offline() will be invoked
5785  * asynchronously once css_tryget_online() is guaranteed to fail and when
5786  * the reference count reaches zero, @css will be released.
5787  */
5788 static void kill_css(struct cgroup_subsys_state *css)
5789 {
5790         lockdep_assert_held(&cgroup_mutex);
5791
5792         if (css->flags & CSS_DYING)
5793                 return;
5794
5795         css->flags |= CSS_DYING;
5796
5797         /*
5798          * This must happen before css is disassociated with its cgroup.
5799          * See seq_css() for details.
5800          */
5801         css_clear_dir(css);
5802
5803         /*
5804          * Killing would put the base ref, but we need to keep it alive
5805          * until after ->css_offline().
5806          */
5807         css_get(css);
5808
5809         /*
5810          * cgroup core guarantees that, by the time ->css_offline() is
5811          * invoked, no new css reference will be given out via
5812          * css_tryget_online().  We can't simply call percpu_ref_kill() and
5813          * proceed to offlining css's because percpu_ref_kill() doesn't
5814          * guarantee that the ref is seen as killed on all CPUs on return.
5815          *
5816          * Use percpu_ref_kill_and_confirm() to get notifications as each
5817          * css is confirmed to be seen as killed on all CPUs.
5818          */
5819         percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
5820 }
5821
5822 /**
5823  * cgroup_destroy_locked - the first stage of cgroup destruction
5824  * @cgrp: cgroup to be destroyed
5825  *
5826  * css's make use of percpu refcnts whose killing latency shouldn't be
5827  * exposed to userland and are RCU protected.  Also, cgroup core needs to
5828  * guarantee that css_tryget_online() won't succeed by the time
5829  * ->css_offline() is invoked.  To satisfy all the requirements,
5830  * destruction is implemented in the following two steps.
5831  *
5832  * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
5833  *     userland visible parts and start killing the percpu refcnts of
5834  *     css's.  Set up so that the next stage will be kicked off once all
5835  *     the percpu refcnts are confirmed to be killed.
5836  *
5837  * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
5838  *     rest of destruction.  Once all cgroup references are gone, the
5839  *     cgroup is RCU-freed.
5840  *
5841  * This function implements s1.  After this step, @cgrp is gone as far as
5842  * the userland is concerned and a new cgroup with the same name may be
5843  * created.  As cgroup doesn't care about the names internally, this
5844  * doesn't cause any problem.
5845  */
5846 static int cgroup_destroy_locked(struct cgroup *cgrp)
5847         __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
5848 {
5849         struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
5850         struct cgroup_subsys_state *css;
5851         struct cgrp_cset_link *link;
5852         int ssid;
5853
5854         lockdep_assert_held(&cgroup_mutex);
5855
5856         /*
5857          * Only migration can raise populated from zero and we're already
5858          * holding cgroup_mutex.
5859          */
5860         if (cgroup_is_populated(cgrp))
5861                 return -EBUSY;
5862
5863         /*
5864          * Make sure there's no live children.  We can't test emptiness of
5865          * ->self.children as dead children linger on it while being
5866          * drained; otherwise, "rmdir parent/child parent" may fail.
5867          */
5868         if (css_has_online_children(&cgrp->self))
5869                 return -EBUSY;
5870
5871         /*
5872          * Mark @cgrp and the associated csets dead.  The former prevents
5873          * further task migration and child creation by disabling
5874          * cgroup_lock_live_group().  The latter makes the csets ignored by
5875          * the migration path.
5876          */
5877         cgrp->self.flags &= ~CSS_ONLINE;
5878
5879         spin_lock_irq(&css_set_lock);
5880         list_for_each_entry(link, &cgrp->cset_links, cset_link)
5881                 link->cset->dead = true;
5882         spin_unlock_irq(&css_set_lock);
5883
5884         /* initiate massacre of all css's */
5885         for_each_css(css, ssid, cgrp)
5886                 kill_css(css);
5887
5888         /* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */
5889         css_clear_dir(&cgrp->self);
5890         kernfs_remove(cgrp->kn);
5891
5892         if (cgroup_is_threaded(cgrp))
5893                 parent->nr_threaded_children--;
5894
5895         spin_lock_irq(&css_set_lock);
5896         for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
5897                 tcgrp->nr_descendants--;
5898                 tcgrp->nr_dying_descendants++;
5899                 /*
5900                  * If the dying cgroup is frozen, decrease frozen descendants
5901                  * counters of ancestor cgroups.
5902                  */
5903                 if (test_bit(CGRP_FROZEN, &cgrp->flags))
5904                         tcgrp->freezer.nr_frozen_descendants--;
5905         }
5906         spin_unlock_irq(&css_set_lock);
5907
5908         cgroup1_check_for_release(parent);
5909
5910         cgroup_bpf_offline(cgrp);
5911
5912         /* put the base reference */
5913         percpu_ref_kill(&cgrp->self.refcnt);
5914
5915         return 0;
5916 };
5917
5918 int cgroup_rmdir(struct kernfs_node *kn)
5919 {
5920         struct cgroup *cgrp;
5921         int ret = 0;
5922
5923         cgrp = cgroup_kn_lock_live(kn, false);
5924         if (!cgrp)
5925                 return 0;
5926
5927         ret = cgroup_destroy_locked(cgrp);
5928         if (!ret)
5929                 TRACE_CGROUP_PATH(rmdir, cgrp);
5930
5931         cgroup_kn_unlock(kn);
5932         return ret;
5933 }
5934
/*
 * kernfs syscall callbacks for cgroup: option display, directory
 * creation/removal and path display are routed to the corresponding
 * cgroup_* handlers.
 */
static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
        .show_options           = cgroup_show_options,
        .mkdir                  = cgroup_mkdir,
        .rmdir                  = cgroup_rmdir,
        .show_path              = cgroup_show_path,
};
5941
/*
 * cgroup_init_subsys - set up the root css of a subsystem at boot
 * @ss: subsystem to initialize
 * @early: true when called from cgroup_init_early(); the css id is then
 *         hard-coded to 1 instead of being allocated from @ss->css_idr
 *
 * Allocates @ss's root css through ->css_alloc(), links it to the default
 * root cgroup, stores it in init_css_set, records which of the fork, exit,
 * release and can_fork callbacks @ss provides and brings the css online.
 * Runs under cgroup_lock().  Every failure is fatal (BUG_ON).
 */
static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
{
        struct cgroup_subsys_state *css;

        pr_debug("Initializing cgroup subsys %s\n", ss->name);

        cgroup_lock();

        idr_init(&ss->css_idr);
        INIT_LIST_HEAD(&ss->cfts);

        /* Create the root cgroup state for this subsystem */
        ss->root = &cgrp_dfl_root;
        css = ss->css_alloc(NULL);
        /* We don't handle early failures gracefully */
        BUG_ON(IS_ERR(css));
        init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);

        /*
         * Root csses are never destroyed and we can't initialize
         * percpu_ref during early init.  Disable refcnting.
         */
        css->flags |= CSS_NO_REF;

        if (early) {
                /* allocation can't be done safely during early init */
                css->id = 1;
        } else {
                /* [1, 2) range: the only id that can be handed out is 1 */
                css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
                BUG_ON(css->id < 0);
        }

        /* Update the init_css_set to contain a subsys
         * pointer to this state - since the subsystem is
         * newly registered, all tasks and hence the
         * init_css_set is in the subsystem's root cgroup. */
        init_css_set.subsys[ss->id] = css;

        /* one bit per subsystem id, set when the callback is provided */
        have_fork_callback |= (bool)ss->fork << ss->id;
        have_exit_callback |= (bool)ss->exit << ss->id;
        have_release_callback |= (bool)ss->release << ss->id;
        have_canfork_callback |= (bool)ss->can_fork << ss->id;

        /* At system boot, before all subsystems have been
         * registered, no tasks have been forked, so we don't
         * need to invoke fork callbacks here. */
        BUG_ON(!list_empty(&init_task.tasks));

        /* online_css() has side effects; a non-zero return is fatal */
        BUG_ON(online_css(css));

        cgroup_unlock();
}
5994
5995 /**
5996  * cgroup_init_early - cgroup initialization at system boot
5997  *
5998  * Initialize cgroups at system boot, and initialize any
5999  * subsystems that request early init.
6000  */
6001 int __init cgroup_init_early(void)
6002 {
6003         static struct cgroup_fs_context __initdata ctx;
6004         struct cgroup_subsys *ss;
6005         int i;
6006
6007         ctx.root = &cgrp_dfl_root;
6008         init_cgroup_root(&ctx);
6009         cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
6010
6011         RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
6012
6013         for_each_subsys(ss, i) {
6014                 WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
6015                      "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n",
6016                      i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
6017                      ss->id, ss->name);
6018                 WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
6019                      "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
6020
6021                 ss->id = i;
6022                 ss->name = cgroup_subsys_name[i];
6023                 if (!ss->legacy_name)
6024                         ss->legacy_name = cgroup_subsys_name[i];
6025
6026                 if (ss->early_init)
6027                         cgroup_init_subsys(ss, true);
6028         }
6029         return 0;
6030 }
6031
/**
 * cgroup_init - cgroup initialization
 *
 * Register cgroup filesystem and /proc file, and initialize
 * any subsystems that didn't request early init.
 *
 * Return: always 0.  Failures in the core setup steps are fatal (BUG_ON);
 * failures while registering files, filesystems and the proc entry only
 * warn (WARN_ON).
 */
int __init cgroup_init(void)
{
        struct cgroup_subsys *ss;
        int ssid;

        BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
        BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
        BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files));
        BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));

        cgroup_rstat_boot();

        get_user_ns(init_cgroup_ns.user_ns);

        cgroup_lock();

        /*
         * Add init_css_set to the hash table so that dfl_root can link to
         * it during init.
         */
        hash_add(css_set_table, &init_css_set.hlist,
                 css_set_hash(init_css_set.subsys));

        BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));

        cgroup_unlock();

        for_each_subsys(ss, ssid) {
                if (ss->early_init) {
                        /*
                         * The css was created by cgroup_init_early() with
                         * a hard-coded id; reserve that id in the idr now.
                         */
                        struct cgroup_subsys_state *css =
                                init_css_set.subsys[ss->id];

                        css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
                                                   GFP_KERNEL);
                        BUG_ON(css->id < 0);
                } else {
                        cgroup_init_subsys(ss, false);
                }

                list_add_tail(&init_css_set.e_cset_node[ssid],
                              &cgrp_dfl_root.cgrp.e_csets[ssid]);

                /*
                 * Setting dfl_root subsys_mask needs to consider the
                 * disabled flag and cftype registration needs kmalloc,
                 * both of which aren't available during early_init.
                 */
                if (!cgroup_ssid_enabled(ssid))
                        continue;

                if (cgroup1_ssid_disabled(ssid))
                        printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
                               ss->name);

                cgrp_dfl_root.subsys_mask |= 1 << ss->id;

                /* implicit controllers must be threaded too */
                WARN_ON(ss->implicit_on_dfl && !ss->threaded);

                if (ss->implicit_on_dfl)
                        cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
                else if (!ss->dfl_cftypes)
                        cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;

                if (ss->threaded)
                        cgrp_dfl_threaded_ss_mask |= 1 << ss->id;

                /* register a shared cftype array only once */
                if (ss->dfl_cftypes == ss->legacy_cftypes) {
                        WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
                } else {
                        WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
                        WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
                }

                if (ss->bind)
                        ss->bind(init_css_set.subsys[ssid]);

                cgroup_lock();
                css_populate_dir(init_css_set.subsys[ssid]);
                cgroup_unlock();
        }

        /* init_css_set.subsys[] has been updated, re-hash */
        hash_del(&init_css_set.hlist);
        hash_add(css_set_table, &init_css_set.hlist,
                 css_set_hash(init_css_set.subsys));

        WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
        WARN_ON(register_filesystem(&cgroup_fs_type));
        WARN_ON(register_filesystem(&cgroup2_fs_type));
        WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
#ifdef CONFIG_CPUSETS
        WARN_ON(register_filesystem(&cpuset_fs_type));
#endif

        return 0;
}
6135
/*
 * cgroup_wq_init - allocate the workqueue used by the destruction path
 *
 * Registered as a core_initcall.  Failing to allocate the workqueue is
 * fatal (BUG_ON).  Always returns 0.
 */
static int __init cgroup_wq_init(void)
{
        /*
         * There isn't much point in executing destruction path in
         * parallel.  Good chunk is serialized with cgroup_mutex anyway.
         * Use 1 for @max_active.
         *
         * We would prefer to do this in cgroup_init() above, but that
         * is called before init_workqueues(): so leave this until after.
         */
        cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
        BUG_ON(!cgroup_destroy_wq);
        return 0;
}
core_initcall(cgroup_wq_init);
6151
6152 void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
6153 {
6154         struct kernfs_node *kn;
6155
6156         kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
6157         if (!kn)
6158                 return;
6159         kernfs_path(kn, buf, buflen);
6160         kernfs_put(kn);
6161 }
6162
6163 /*
6164  * cgroup_get_from_id : get the cgroup associated with cgroup id
6165  * @id: cgroup id
6166  * On success return the cgrp or ERR_PTR on failure
6167  * Only cgroups within current task's cgroup NS are valid.
6168  */
6169 struct cgroup *cgroup_get_from_id(u64 id)
6170 {
6171         struct kernfs_node *kn;
6172         struct cgroup *cgrp, *root_cgrp;
6173
6174         kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
6175         if (!kn)
6176                 return ERR_PTR(-ENOENT);
6177
6178         if (kernfs_type(kn) != KERNFS_DIR) {
6179                 kernfs_put(kn);
6180                 return ERR_PTR(-ENOENT);
6181         }
6182
6183         rcu_read_lock();
6184
6185         cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
6186         if (cgrp && !cgroup_tryget(cgrp))
6187                 cgrp = NULL;
6188
6189         rcu_read_unlock();
6190         kernfs_put(kn);
6191
6192         if (!cgrp)
6193                 return ERR_PTR(-ENOENT);
6194
6195         root_cgrp = current_cgns_cgroup_dfl();
6196         if (!cgroup_is_descendant(cgrp, root_cgrp)) {
6197                 cgroup_put(cgrp);
6198                 return ERR_PTR(-ENOENT);
6199         }
6200
6201         return cgrp;
6202 }
6203 EXPORT_SYMBOL_GPL(cgroup_get_from_id);
6204
/*
 * proc_cgroup_show()
 *  - Print task's cgroup paths into seq_file, one line for each hierarchy
 *  - Used for /proc/<pid>/cgroup.
 *
 * Each line has the form "<hierarchy-id>:<controller-list>:<path>".  The
 * controller list is emitted only for non-default hierarchies and may be
 * followed by "name=<root name>" when the root has a name.
 *
 * Returns 0 on success, -ENOMEM if the path buffer can't be allocated,
 * -ENAMETOOLONG if a path doesn't fit in PATH_MAX, or the negative error
 * from cgroup_path_ns_locked().
 */
int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
                     struct pid *pid, struct task_struct *tsk)
{
        char *buf;
        int retval;
        struct cgroup_root *root;

        retval = -ENOMEM;
        buf = kmalloc(PATH_MAX, GFP_KERNEL);
        if (!buf)
                goto out;

        cgroup_lock();
        spin_lock_irq(&css_set_lock);

        for_each_root(root) {
                struct cgroup_subsys *ss;
                struct cgroup *cgrp;
                int ssid, count = 0;

                /* skip the default hierarchy while it isn't marked visible */
                if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible))
                        continue;

                seq_printf(m, "%d:", root->hierarchy_id);
                if (root != &cgrp_dfl_root)
                        for_each_subsys(ss, ssid)
                                if (root->subsys_mask & (1 << ssid))
                                        seq_printf(m, "%s%s", count++ ? "," : "",
                                                   ss->legacy_name);
                if (strlen(root->name))
                        seq_printf(m, "%sname=%s", count ? "," : "",
                                   root->name);
                seq_putc(m, ':');

                cgrp = task_cgroup_from_root(tsk, root);

                /*
                 * On traditional hierarchies, all zombie tasks show up as
                 * belonging to the root cgroup.  On the default hierarchy,
                 * while a zombie doesn't show up in "cgroup.procs" and
                 * thus can't be migrated, its /proc/PID/cgroup keeps
                 * reporting the cgroup it belonged to before exiting.  If
                 * the cgroup is removed before the zombie is reaped,
                 * " (deleted)" is appended to the cgroup path.
                 */
                if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
                        /* path is resolved relative to the reader's namespace */
                        retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
                                                current->nsproxy->cgroup_ns);
                        if (retval >= PATH_MAX)
                                retval = -ENAMETOOLONG;
                        if (retval < 0)
                                goto out_unlock;

                        seq_puts(m, buf);
                } else {
                        seq_puts(m, "/");
                }

                if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
                        seq_puts(m, " (deleted)\n");
                else
                        seq_putc(m, '\n');
        }

        /* the last path lookup left a positive value behind; report success */
        retval = 0;
out_unlock:
        spin_unlock_irq(&css_set_lock);
        cgroup_unlock();
        kfree(buf);
out:
        return retval;
}
6282
/**
 * cgroup_fork - initialize cgroup related fields during copy_process()
 * @child: pointer to task_struct of the task being created
 *
 * Points @child->cgroups at init_css_set and initializes @child->cg_list
 * as an empty list.  A task is associated with the init_css_set until
 * cgroup_post_fork() attaches it to the target css_set.
 */
void cgroup_fork(struct task_struct *child)
{
        RCU_INIT_POINTER(child->cgroups, &init_css_set);
        INIT_LIST_HEAD(&child->cg_list);
}
6295
6296 /**
6297  * cgroup_v1v2_get_from_file - get a cgroup pointer from a file pointer
6298  * @f: file corresponding to cgroup_dir
6299  *
6300  * Find the cgroup from a file pointer associated with a cgroup directory.
6301  * Returns a pointer to the cgroup on success. ERR_PTR is returned if the
6302  * cgroup cannot be found.
6303  */
6304 static struct cgroup *cgroup_v1v2_get_from_file(struct file *f)
6305 {
6306         struct cgroup_subsys_state *css;
6307
6308         css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
6309         if (IS_ERR(css))
6310                 return ERR_CAST(css);
6311
6312         return css->cgroup;
6313 }
6314
6315 /**
6316  * cgroup_get_from_file - same as cgroup_v1v2_get_from_file, but only supports
6317  * cgroup2.
6318  * @f: file corresponding to cgroup2_dir
6319  */
6320 static struct cgroup *cgroup_get_from_file(struct file *f)
6321 {
6322         struct cgroup *cgrp = cgroup_v1v2_get_from_file(f);
6323
6324         if (IS_ERR(cgrp))
6325                 return ERR_CAST(cgrp);
6326
6327         if (!cgroup_on_dfl(cgrp)) {
6328                 cgroup_put(cgrp);
6329                 return ERR_PTR(-EBADF);
6330         }
6331
6332         return cgrp;
6333 }
6334
/**
 * cgroup_css_set_fork - find or create a css_set for a child process
 * @kargs: the arguments passed to create the child process
 *
 * This functions finds or creates a new css_set which the child
 * process will be attached to in cgroup_post_fork(). By default,
 * the child process will be given the same css_set as its parent.
 *
 * If CLONE_INTO_CGROUP is specified this function will try to find an
 * existing css_set which includes the requested cgroup and if not create
 * a new css_set that the child will be attached to later. If this function
 * succeeds it will hold cgroup_threadgroup_rwsem on return. If
 * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex
 * before grabbing cgroup_threadgroup_rwsem and will hold a reference
 * to the target cgroup.
 *
 * Return: 0 on success with @kargs->cset (and, for CLONE_INTO_CGROUP,
 * @kargs->cgrp) set.  On failure a negative errno is returned and the
 * locks and references taken here have been released again.
 */
static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
        __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
{
        int ret;
        struct cgroup *dst_cgrp = NULL;
        struct css_set *cset;
        struct super_block *sb;
        struct file *f;

        if (kargs->flags & CLONE_INTO_CGROUP)
                cgroup_lock();

        cgroup_threadgroup_change_begin(current);

        /* pin the parent's (current's) css_set */
        spin_lock_irq(&css_set_lock);
        cset = task_css_set(current);
        get_css_set(cset);
        spin_unlock_irq(&css_set_lock);

        /* default case: the child simply inherits the parent's css_set */
        if (!(kargs->flags & CLONE_INTO_CGROUP)) {
                kargs->cset = cset;
                return 0;
        }

        f = fget_raw(kargs->cgroup);
        if (!f) {
                ret = -EBADF;
                goto err;
        }
        sb = f->f_path.dentry->d_sb;

        dst_cgrp = cgroup_get_from_file(f);
        if (IS_ERR(dst_cgrp)) {
                ret = PTR_ERR(dst_cgrp);
                /* keep the error path from putting an ERR_PTR */
                dst_cgrp = NULL;
                goto err;
        }

        if (cgroup_is_dead(dst_cgrp)) {
                ret = -ENODEV;
                goto err;
        }

        /*
         * Verify that the target cgroup is writable for us. This is
         * usually done by the vfs layer but since we're not going through
         * the vfs layer here we need to do it "manually".
         */
        ret = cgroup_may_write(dst_cgrp, sb);
        if (ret)
                goto err;

        /*
         * Spawning a task directly into a cgroup works by passing a file
         * descriptor to the target cgroup directory. This can even be an O_PATH
         * file descriptor. But it can never be a cgroup.procs file descriptor.
         * This was done on purpose so spawning into a cgroup could be
         * conceptualized as an atomic
         *
         *   fd = openat(dfd_cgroup, "cgroup.procs", ...);
         *   write(fd, <child-pid>, ...);
         *
         * sequence, i.e. it's a shorthand for the caller opening and writing
         * cgroup.procs of the cgroup indicated by @dfd_cgroup. This allows us
         * to always use the caller's credentials.
         */
        ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
                                        !(kargs->flags & CLONE_THREAD),
                                        current->nsproxy->cgroup_ns);
        if (ret)
                goto err;

        kargs->cset = find_css_set(cset, dst_cgrp);
        if (!kargs->cset) {
                ret = -ENOMEM;
                goto err;
        }

        /*
         * Success: the parent's css_set and the file are no longer needed;
         * the reference on dst_cgrp is handed over to @kargs.
         */
        put_css_set(cset);
        fput(f);
        kargs->cgrp = dst_cgrp;
        return ret;

err:
        /*
         * Only reachable with CLONE_INTO_CGROUP set (the other case returned
         * early above), so cgroup_mutex is held here.
         */
        cgroup_threadgroup_change_end(current);
        cgroup_unlock();
        if (f)
                fput(f);
        if (dst_cgrp)
                cgroup_put(dst_cgrp);
        put_css_set(cset);
        if (kargs->cset)
                put_css_set(kargs->cset);
        return ret;
}
6446
6447 /**
6448  * cgroup_css_set_put_fork - drop references we took during fork
6449  * @kargs: the arguments passed to create the child process
6450  *
6451  * Drop references to the prepared css_set and target cgroup if
6452  * CLONE_INTO_CGROUP was requested.
6453  */
6454 static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
6455         __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
6456 {
6457         struct cgroup *cgrp = kargs->cgrp;
6458         struct css_set *cset = kargs->cset;
6459
6460         cgroup_threadgroup_change_end(current);
6461
6462         if (cset) {
6463                 put_css_set(cset);
6464                 kargs->cset = NULL;
6465         }
6466
6467         if (kargs->flags & CLONE_INTO_CGROUP) {
6468                 cgroup_unlock();
6469                 if (cgrp) {
6470                         cgroup_put(cgrp);
6471                         kargs->cgrp = NULL;
6472                 }
6473         }
6474 }
6475
6476 /**
6477  * cgroup_can_fork - called on a new task before the process is exposed
6478  * @child: the child process
6479  * @kargs: the arguments passed to create the child process
6480  *
6481  * This prepares a new css_set for the child process which the child will
6482  * be attached to in cgroup_post_fork().
6483  * This calls the subsystem can_fork() callbacks. If the cgroup_can_fork()
6484  * callback returns an error, the fork aborts with that error code. This
6485  * allows for a cgroup subsystem to conditionally allow or deny new forks.
6486  */
6487 int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
6488 {
6489         struct cgroup_subsys *ss;
6490         int i, j, ret;
6491
6492         ret = cgroup_css_set_fork(kargs);
6493         if (ret)
6494                 return ret;
6495
6496         do_each_subsys_mask(ss, i, have_canfork_callback) {
6497                 ret = ss->can_fork(child, kargs->cset);
6498                 if (ret)
6499                         goto out_revert;
6500         } while_each_subsys_mask();
6501
6502         return 0;
6503
6504 out_revert:
6505         for_each_subsys(ss, j) {
6506                 if (j >= i)
6507                         break;
6508                 if (ss->cancel_fork)
6509                         ss->cancel_fork(child, kargs->cset);
6510         }
6511
6512         cgroup_css_set_put_fork(kargs);
6513
6514         return ret;
6515 }
6516
6517 /**
6518  * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
6519  * @child: the child process
6520  * @kargs: the arguments passed to create the child process
6521  *
6522  * This calls the cancel_fork() callbacks if a fork failed *after*
6523  * cgroup_can_fork() succeeded and cleans up references we took to
6524  * prepare a new css_set for the child process in cgroup_can_fork().
6525  */
6526 void cgroup_cancel_fork(struct task_struct *child,
6527                         struct kernel_clone_args *kargs)
6528 {
6529         struct cgroup_subsys *ss;
6530         int i;
6531
6532         for_each_subsys(ss, i)
6533                 if (ss->cancel_fork)
6534                         ss->cancel_fork(child, kargs->cset);
6535
6536         cgroup_css_set_put_fork(kargs);
6537 }
6538
/**
 * cgroup_post_fork - finalize cgroup setup for the child process
 * @child: the child process
 * @kargs: the arguments passed to create the child process
 *
 * Attach the child process to its css_set calling the subsystem fork()
 * callbacks.  Also marks the child for freezing or sends it SIGKILL when
 * the cgroup flags sampled here have CGRP_FREEZE or CGRP_KILL set, and
 * finally releases what cgroup_can_fork() acquired through
 * cgroup_css_set_put_fork().
 */
void cgroup_post_fork(struct task_struct *child,
                      struct kernel_clone_args *kargs)
        __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
        unsigned long cgrp_flags = 0;
        bool kill = false;
        struct cgroup_subsys *ss;
        struct css_set *cset;
        int i;

        /*
         * Take over the prepared css_set; clearing @kargs->cset keeps
         * cgroup_css_set_put_fork() below from putting it.
         */
        cset = kargs->cset;
        kargs->cset = NULL;

        spin_lock_irq(&css_set_lock);

        /* init tasks are special, only link regular threads */
        if (likely(child->pid)) {
                /* sample the flags of the cgroup the child ends up in */
                if (kargs->cgrp)
                        cgrp_flags = kargs->cgrp->flags;
                else
                        cgrp_flags = cset->dfl_cgrp->flags;

                WARN_ON_ONCE(!list_empty(&child->cg_list));
                cset->nr_tasks++;
                css_set_move_task(child, NULL, cset, false);
        } else {
                put_css_set(cset);
                cset = NULL;
        }

        if (!(child->flags & PF_KTHREAD)) {
                if (unlikely(test_bit(CGRP_FREEZE, &cgrp_flags))) {
                        /*
                         * If the cgroup has to be frozen, the new task has
                         * too. Let's set the JOBCTL_TRAP_FREEZE jobctl bit to
                         * get the task into the frozen state.
                         */
                        spin_lock(&child->sighand->siglock);
                        WARN_ON_ONCE(child->frozen);
                        child->jobctl |= JOBCTL_TRAP_FREEZE;
                        spin_unlock(&child->sighand->siglock);

                        /*
                         * Calling cgroup_update_frozen() isn't required here,
                         * because it will be called anyway a bit later from
                         * do_freezer_trap(). So we avoid cgroup's transient
                         * switch from the frozen state and back.
                         */
                }

                /*
                 * If the cgroup is to be killed notice it now and take the
                 * child down right after we finished preparing it for
                 * userspace.
                 */
                kill = test_bit(CGRP_KILL, &cgrp_flags);
        }

        spin_unlock_irq(&css_set_lock);

        /*
         * Call ss->fork().  This must happen after @child is linked on
         * css_set; otherwise, @child might change state between ->fork()
         * and addition to css_set.
         */
        do_each_subsys_mask(ss, i, have_fork_callback) {
                ss->fork(child);
        } while_each_subsys_mask();

        /*
         * Make the new cset the root_cset of the new cgroup namespace.
         * NOTE(review): cset is NULL here when child->pid is 0; presumably
         * such tasks are never created with CLONE_NEWCGROUP - confirm.
         */
        if (kargs->flags & CLONE_NEWCGROUP) {
                struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;

                get_css_set(cset);
                child->nsproxy->cgroup_ns->root_cset = cset;
                put_css_set(rcset);
        }

        /* Cgroup has to be killed so take down child immediately. */
        if (unlikely(kill))
                do_send_sig_info(SIGKILL, SEND_SIG_NOINFO, child, PIDTYPE_TGID);

        cgroup_css_set_put_fork(kargs);
}
6631
/**
 * cgroup_exit - detach cgroup from exiting task
 * @tsk: pointer to task_struct of exiting process
 *
 * Description: Detach cgroup from @tsk.
 *
 * Under css_set_lock, @tsk is moved off its css_set's task list and queued
 * on that css_set's dying_tasks list, and the css_set's task count is
 * decremented.  Subsystem ->exit() callbacks run after the lock has been
 * dropped.
 */
void cgroup_exit(struct task_struct *tsk)
{
        struct cgroup_subsys *ss;
        struct css_set *cset;
        int i;

        spin_lock_irq(&css_set_lock);

        WARN_ON_ONCE(list_empty(&tsk->cg_list));
        cset = task_css_set(tsk);
        css_set_move_task(tsk, cset, NULL, false);
        /* keep the task reachable through the css_set's dying_tasks list */
        list_add_tail(&tsk->cg_list, &cset->dying_tasks);
        cset->nr_tasks--;

        if (dl_task(tsk))
                dec_dl_tasks_cs(tsk);

        WARN_ON_ONCE(cgroup_task_frozen(tsk));
        /* a non-kthread leaving a freezing cgroup may change its frozen state */
        if (unlikely(!(tsk->flags & PF_KTHREAD) &&
                     test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
                cgroup_update_frozen(task_dfl_cgroup(tsk));

        spin_unlock_irq(&css_set_lock);

        /* see cgroup_post_fork() for details */
        do_each_subsys_mask(ss, i, have_exit_callback) {
                ss->exit(tsk);
        } while_each_subsys_mask();
}
6668
6669 void cgroup_release(struct task_struct *task)
6670 {
6671         struct cgroup_subsys *ss;
6672         int ssid;
6673
6674         do_each_subsys_mask(ss, ssid, have_release_callback) {
6675                 ss->release(task);
6676         } while_each_subsys_mask();
6677
6678         spin_lock_irq(&css_set_lock);
6679         css_set_skip_task_iters(task_css_set(task), task);
6680         list_del_init(&task->cg_list);
6681         spin_unlock_irq(&css_set_lock);
6682 }
6683
/**
 * cgroup_free - drop a task's css_set reference
 * @task: task whose css_set is put
 */
void cgroup_free(struct task_struct *task)
{
        put_css_set(task_css_set(task));
}
6689
6690 static int __init cgroup_disable(char *str)
6691 {
6692         struct cgroup_subsys *ss;
6693         char *token;
6694         int i;
6695
6696         while ((token = strsep(&str, ",")) != NULL) {
6697                 if (!*token)
6698                         continue;
6699
6700                 for_each_subsys(ss, i) {
6701                         if (strcmp(token, ss->name) &&
6702                             strcmp(token, ss->legacy_name))
6703                                 continue;
6704
6705                         static_branch_disable(cgroup_subsys_enabled_key[i]);
6706                         pr_info("Disabling %s control group subsystem\n",
6707                                 ss->name);
6708                 }
6709
6710                 for (i = 0; i < OPT_FEATURE_COUNT; i++) {
6711                         if (strcmp(token, cgroup_opt_feature_names[i]))
6712                                 continue;
6713                         cgroup_feature_disable_mask |= 1 << i;
6714                         pr_info("Disabling %s control group feature\n",
6715                                 cgroup_opt_feature_names[i]);
6716                         break;
6717                 }
6718         }
6719         return 1;
6720 }
6721 __setup("cgroup_disable=", cgroup_disable);
6722
6723 void __init __weak enable_debug_cgroup(void) { }
6724
6725 static int __init enable_cgroup_debug(char *str)
6726 {
6727         cgroup_debug = true;
6728         enable_debug_cgroup();
6729         return 1;
6730 }
6731 __setup("cgroup_debug", enable_cgroup_debug);
6732
6733 /**
6734  * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
6735  * @dentry: directory dentry of interest
6736  * @ss: subsystem of interest
6737  *
6738  * If @dentry is a directory for a cgroup which has @ss enabled on it, try
6739  * to get the corresponding css and return it.  If such css doesn't exist
6740  * or can't be pinned, an ERR_PTR value is returned.
6741  */
6742 struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
6743                                                        struct cgroup_subsys *ss)
6744 {
6745         struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
6746         struct file_system_type *s_type = dentry->d_sb->s_type;
6747         struct cgroup_subsys_state *css = NULL;
6748         struct cgroup *cgrp;
6749
6750         /* is @dentry a cgroup dir? */
6751         if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) ||
6752             !kn || kernfs_type(kn) != KERNFS_DIR)
6753                 return ERR_PTR(-EBADF);
6754
6755         rcu_read_lock();
6756
6757         /*
6758          * This path doesn't originate from kernfs and @kn could already
6759          * have been or be removed at any point.  @kn->priv is RCU
6760          * protected for this access.  See css_release_work_fn() for details.
6761          */
6762         cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
6763         if (cgrp)
6764                 css = cgroup_css(cgrp, ss);
6765
6766         if (!css || !css_tryget_online(css))
6767                 css = ERR_PTR(-ENOENT);
6768
6769         rcu_read_unlock();
6770         return css;
6771 }
6772
6773 /**
6774  * css_from_id - lookup css by id
6775  * @id: the cgroup id
6776  * @ss: cgroup subsys to be looked into
6777  *
6778  * Returns the css if there's valid one with @id, otherwise returns NULL.
6779  * Should be called under rcu_read_lock().
6780  */
6781 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
6782 {
6783         WARN_ON_ONCE(!rcu_read_lock_held());
6784         return idr_find(&ss->css_idr, id);
6785 }
6786
6787 /**
6788  * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path
6789  * @path: path on the default hierarchy
6790  *
6791  * Find the cgroup at @path on the default hierarchy, increment its
6792  * reference count and return it.  Returns pointer to the found cgroup on
6793  * success, ERR_PTR(-ENOENT) if @path doesn't exist or if the cgroup has already
6794  * been released and ERR_PTR(-ENOTDIR) if @path points to a non-directory.
6795  */
6796 struct cgroup *cgroup_get_from_path(const char *path)
6797 {
6798         struct kernfs_node *kn;
6799         struct cgroup *cgrp = ERR_PTR(-ENOENT);
6800         struct cgroup *root_cgrp;
6801
6802         root_cgrp = current_cgns_cgroup_dfl();
6803         kn = kernfs_walk_and_get(root_cgrp->kn, path);
6804         if (!kn)
6805                 goto out;
6806
6807         if (kernfs_type(kn) != KERNFS_DIR) {
6808                 cgrp = ERR_PTR(-ENOTDIR);
6809                 goto out_kernfs;
6810         }
6811
6812         rcu_read_lock();
6813
6814         cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
6815         if (!cgrp || !cgroup_tryget(cgrp))
6816                 cgrp = ERR_PTR(-ENOENT);
6817
6818         rcu_read_unlock();
6819
6820 out_kernfs:
6821         kernfs_put(kn);
6822 out:
6823         return cgrp;
6824 }
6825 EXPORT_SYMBOL_GPL(cgroup_get_from_path);
6826
6827 /**
6828  * cgroup_v1v2_get_from_fd - get a cgroup pointer from a fd
6829  * @fd: fd obtained by open(cgroup_dir)
6830  *
6831  * Find the cgroup from a fd which should be obtained
6832  * by opening a cgroup directory.  Returns a pointer to the
6833  * cgroup on success. ERR_PTR is returned if the cgroup
6834  * cannot be found.
6835  */
6836 struct cgroup *cgroup_v1v2_get_from_fd(int fd)
6837 {
6838         struct cgroup *cgrp;
6839         struct fd f = fdget_raw(fd);
6840         if (!f.file)
6841                 return ERR_PTR(-EBADF);
6842
6843         cgrp = cgroup_v1v2_get_from_file(f.file);
6844         fdput(f);
6845         return cgrp;
6846 }
6847
6848 /**
6849  * cgroup_get_from_fd - same as cgroup_v1v2_get_from_fd, but only supports
6850  * cgroup2.
6851  * @fd: fd obtained by open(cgroup2_dir)
6852  */
6853 struct cgroup *cgroup_get_from_fd(int fd)
6854 {
6855         struct cgroup *cgrp = cgroup_v1v2_get_from_fd(fd);
6856
6857         if (IS_ERR(cgrp))
6858                 return ERR_CAST(cgrp);
6859
6860         if (!cgroup_on_dfl(cgrp)) {
6861                 cgroup_put(cgrp);
6862                 return ERR_PTR(-EBADF);
6863         }
6864         return cgrp;
6865 }
6866 EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
6867
6868 static u64 power_of_ten(int power)
6869 {
6870         u64 v = 1;
6871         while (power--)
6872                 v *= 10;
6873         return v;
6874 }
6875
6876 /**
6877  * cgroup_parse_float - parse a floating number
6878  * @input: input string
6879  * @dec_shift: number of decimal digits to shift
6880  * @v: output
6881  *
6882  * Parse a decimal floating point number in @input and store the result in
6883  * @v with decimal point right shifted @dec_shift times.  For example, if
6884  * @input is "12.3456" and @dec_shift is 3, *@v will be set to 12345.
6885  * Returns 0 on success, -errno otherwise.
6886  *
6887  * There's nothing cgroup specific about this function except that it's
6888  * currently the only user.
6889  */
6890 int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
6891 {
6892         s64 whole, frac = 0;
6893         int fstart = 0, fend = 0, flen;
6894
6895         if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend))
6896                 return -EINVAL;
6897         if (frac < 0)
6898                 return -EINVAL;
6899
6900         flen = fend > fstart ? fend - fstart : 0;
6901         if (flen < dec_shift)
6902                 frac *= power_of_ten(dec_shift - flen);
6903         else
6904                 frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift));
6905
6906         *v = whole * power_of_ten(dec_shift) + frac;
6907         return 0;
6908 }
6909
6910 /*
6911  * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
6912  * definition in cgroup-defs.h.
6913  */
6914 #ifdef CONFIG_SOCK_CGROUP_DATA
6915
6916 void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
6917 {
6918         struct cgroup *cgroup;
6919
6920         rcu_read_lock();
6921         /* Don't associate the sock with unrelated interrupted task's cgroup. */
6922         if (in_interrupt()) {
6923                 cgroup = &cgrp_dfl_root.cgrp;
6924                 cgroup_get(cgroup);
6925                 goto out;
6926         }
6927
6928         while (true) {
6929                 struct css_set *cset;
6930
6931                 cset = task_css_set(current);
6932                 if (likely(cgroup_tryget(cset->dfl_cgrp))) {
6933                         cgroup = cset->dfl_cgrp;
6934                         break;
6935                 }
6936                 cpu_relax();
6937         }
6938 out:
6939         skcd->cgroup = cgroup;
6940         cgroup_bpf_get(cgroup);
6941         rcu_read_unlock();
6942 }
6943
/*
 * cgroup_sk_clone - take the references needed for a cloned sock
 * @skcd: sock cgroup data of the clone
 *
 * Takes a cgroup reference and a cgroup_bpf reference on the cgroup that
 * @skcd points to.
 */
void cgroup_sk_clone(struct sock_cgroup_data *skcd)
{
	struct cgroup *cloned_cgrp = sock_cgroup_ptr(skcd);

	/*
	 * The socket being cloned may sit in an empty cgroup that has
	 * already been rmdir'd, so a plain cgroup_get() is used here
	 * rather than cgroup_get_live().
	 */
	cgroup_get(cloned_cgrp);
	cgroup_bpf_get(cloned_cgrp);
}
6956
/*
 * cgroup_sk_free - drop the references held through @skcd
 * @skcd: sock cgroup data being torn down
 *
 * Puts the cgroup_bpf reference first and then the cgroup reference.
 */
void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
	struct cgroup *held_cgrp = sock_cgroup_ptr(skcd);

	cgroup_bpf_put(held_cgrp);
	cgroup_put(held_cgrp);
}
6964
6965 #endif  /* CONFIG_SOCK_CGROUP_DATA */
6966
6967 #ifdef CONFIG_SYSFS
6968 static ssize_t show_delegatable_files(struct cftype *files, char *buf,
6969                                       ssize_t size, const char *prefix)
6970 {
6971         struct cftype *cft;
6972         ssize_t ret = 0;
6973
6974         for (cft = files; cft && cft->name[0] != '\0'; cft++) {
6975                 if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
6976                         continue;
6977
6978                 if (prefix)
6979                         ret += snprintf(buf + ret, size - ret, "%s.", prefix);
6980
6981                 ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);
6982
6983                 if (WARN_ON(ret >= size))
6984                         break;
6985         }
6986
6987         return ret;
6988 }
6989
6990 static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
6991                               char *buf)
6992 {
6993         struct cgroup_subsys *ss;
6994         int ssid;
6995         ssize_t ret = 0;
6996
6997         ret = show_delegatable_files(cgroup_base_files, buf + ret,
6998                                      PAGE_SIZE - ret, NULL);
6999         if (cgroup_psi_enabled())
7000                 ret += show_delegatable_files(cgroup_psi_files, buf + ret,
7001                                               PAGE_SIZE - ret, NULL);
7002
7003         for_each_subsys(ss, ssid)
7004                 ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
7005                                               PAGE_SIZE - ret,
7006                                               cgroup_subsys_name[ssid]);
7007
7008         return ret;
7009 }
7010 static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
7011
7012 static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
7013                              char *buf)
7014 {
7015         return snprintf(buf, PAGE_SIZE,
7016                         "nsdelegate\n"
7017                         "favordynmods\n"
7018                         "memory_localevents\n"
7019                         "memory_recursiveprot\n");
7020 }
7021 static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
7022
7023 static struct attribute *cgroup_sysfs_attrs[] = {
7024         &cgroup_delegate_attr.attr,
7025         &cgroup_features_attr.attr,
7026         NULL,
7027 };
7028
7029 static const struct attribute_group cgroup_sysfs_attr_group = {
7030         .attrs = cgroup_sysfs_attrs,
7031         .name = "cgroup",
7032 };
7033
7034 static int __init cgroup_sysfs_init(void)
7035 {
7036         return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
7037 }
7038 subsys_initcall(cgroup_sysfs_init);
7039
7040 #endif /* CONFIG_SYSFS */