kernel/cgroup/cpuset.c

   1 /*
   2  *  kernel/cgroup/cpuset.c
   3  *
   4  *  Processor and Memory placement constraints for sets of tasks.
   5  *
   6  *  Copyright (C) 2003 BULL SA.
   7  *  Copyright (C) 2004-2007 Silicon Graphics, Inc.
   8  *  Copyright (C) 2006 Google, Inc
   9  *
  10  *  Portions derived from Patrick Mochel's sysfs code.
  11  *  sysfs is Copyright (c) 2001-3 Patrick Mochel
  12  *
  13  *  2003-10-10 Written by Simon Derr.
  14  *  2003-10-22 Updates by Stephen Hemminger.
  15  *  2004 May-July Rework by Paul Jackson.
  16  *  2006 Rework by Paul Menage to use generic cgroups
  17  *  2008 Rework of the scheduler domains and CPU hotplug handling
  18  *       by Max Krasnyansky
  19  *
  20  *  This file is subject to the terms and conditions of the GNU General Public
  21  *  License.  See the file COPYING in the main directory of the Linux
  22  *  distribution for more details.
  23  */
  24
  25 #include <linux/cpu.h>
  26 #include <linux/cpumask.h>
  27 #include <linux/cpuset.h>
  28 #include <linux/err.h>
  29 #include <linux/errno.h>
  30 #include <linux/file.h>
  31 #include <linux/fs.h>
  32 #include <linux/init.h>
  33 #include <linux/interrupt.h>
  34 #include <linux/kernel.h>
  35 #include <linux/kmod.h>
  36 #include <linux/list.h>
  37 #include <linux/mempolicy.h>
  38 #include <linux/mm.h>
  39 #include <linux/memory.h>
  40 #include <linux/export.h>
  41 #include <linux/mount.h>
  42 #include <linux/fs_context.h>
  43 #include <linux/namei.h>
  44 #include <linux/pagemap.h>
  45 #include <linux/proc_fs.h>
  46 #include <linux/rcupdate.h>
  47 #include <linux/sched.h>
  48 #include <linux/sched/deadline.h>
  49 #include <linux/sched/mm.h>
  50 #include <linux/sched/task.h>
  51 #include <linux/seq_file.h>
  52 #include <linux/security.h>
  53 #include <linux/slab.h>
  54 #include <linux/spinlock.h>
  55 #include <linux/stat.h>
  56 #include <linux/string.h>
  57 #include <linux/time.h>
  58 #include <linux/time64.h>
  59 #include <linux/backing-dev.h>
  60 #include <linux/sort.h>
  61 #include <linux/oom.h>
  62 #include <linux/sched/isolation.h>
  63 #include <linux/uaccess.h>
  64 #include <linux/atomic.h>
  65 #include <linux/mutex.h>
  66 #include <linux/cgroup.h>
  67 #include <linux/wait.h>
  68
  69 DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
  70 DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
  71
  72 /*
  73  * There could be abnormal cpuset configurations for cpu or memory
  74  * node binding, add this key to provide a quick low-cost judgement
  75  * of the situation.
  76  */
  77 DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);
  78
  79 /* See "Frequency meter" comments, below. */
  80
  81 struct fmeter {
  82         int cnt;                /* unprocessed events count */
  83         int val;                /* most recent output value */
  84         time64_t time;          /* clock (secs) when val computed */
  85         spinlock_t lock;        /* guards read or write of above */
  86 };
  87
  88 struct cpuset {
  89         struct cgroup_subsys_state css;
  90
  91         unsigned long flags;            /* "unsigned long" so bitops work */
  92
  93         /*
  94          * On default hierarchy:
  95          *
  96          * The user-configured masks can only be changed by writing to
  97          * cpuset.cpus and cpuset.mems, and won't be limited by the
  98          * parent masks.
  99          *
 100          * The effective masks is the real masks that apply to the tasks
 101          * in the cpuset. They may be changed if the configured masks are
 102          * changed or hotplug happens.
 103          *
 104          * effective_mask == configured_mask & parent's effective_mask,
 105          * and if it ends up empty, it will inherit the parent's mask.
 106          *
 107          *
 108          * On legacy hierarchy:
 109          *
 110          * The user-configured masks are always the same with effective masks.
 111          */
 112
 113         /* user-configured CPUs and Memory Nodes allow to tasks */
 114         cpumask_var_t cpus_allowed;
 115         nodemask_t mems_allowed;
 116
 117         /* effective CPUs and Memory Nodes allow to tasks */
 118         cpumask_var_t effective_cpus;
 119         nodemask_t effective_mems;
 120
 121         /*
 122          * CPUs allocated to child sub-partitions (default hierarchy only)
 123          * - CPUs granted by the parent = effective_cpus U subparts_cpus
 124          * - effective_cpus and subparts_cpus are mutually exclusive.
 125          *
 126          * effective_cpus contains only onlined CPUs, but subparts_cpus
 127          * may have offlined ones.
 128          */
 129         cpumask_var_t subparts_cpus;
 130
 131         /*
 132          * This is old Memory Nodes tasks took on.
 133          *
 134          * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
 135          * - A new cpuset's old_mems_allowed is initialized when some
 136          *   task is moved into it.
 137          * - old_mems_allowed is used in cpuset_migrate_mm() when we change
 138          *   cpuset.mems_allowed and have tasks' nodemask updated, and
 139          *   then old_mems_allowed is updated to mems_allowed.
 140          */
 141         nodemask_t old_mems_allowed;
 142
 143         struct fmeter fmeter;           /* memory_pressure filter */
 144
 145         /*
 146          * Tasks are being attached to this cpuset.  Used to prevent
 147          * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
 148          */
 149         int attach_in_progress;
 150
 151         /* partition number for rebuild_sched_domains() */
 152         int pn;
 153
 154         /* for custom sched domain */
 155         int relax_domain_level;
 156
 157         /* number of CPUs in subparts_cpus */
 158         int nr_subparts_cpus;
 159
 160         /* partition root state */
 161         int partition_root_state;
 162
 163         /*
 164          * Default hierarchy only:
 165          * use_parent_ecpus - set if using parent's effective_cpus
 166          * child_ecpus_count - # of children with use_parent_ecpus set
 167          */
 168         int use_parent_ecpus;
 169         int child_ecpus_count;
 170
 171         /* Handle for cpuset.cpus.partition */
 172         struct cgroup_file partition_file;
 173 };
 174
 175 /*
 176  * Partition root states:
 177  *
 178  *   0 - not a partition root
 179  *
 180  *   1 - partition root
 181  *
 182  *  -1 - invalid partition root
 183  *       None of the cpus in cpus_allowed can be put into the parent's
 184  *       subparts_cpus. In this case, the cpuset is not a real partition
 185  *       root anymore.  However, the CPU_EXCLUSIVE bit will still be set
 186  *       and the cpuset can be restored back to a partition root if the
 187  *       parent cpuset can give more CPUs back to this child cpuset.
 188  */
 189 #define PRS_DISABLED            0
 190 #define PRS_ENABLED             1
 191 #define PRS_ERROR               -1
 192
 193 /*
 194  * Temporary cpumasks for working with partitions that are passed among
 195  * functions to avoid memory allocation in inner functions.
 196  */
 197 struct tmpmasks {
 198         cpumask_var_t addmask, delmask; /* For partition root */
 199         cpumask_var_t new_cpus;         /* For update_cpumasks_hier() */
 200 };
 201
 202 static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
 203 {
 204         return css ? container_of(css, struct cpuset, css) : NULL;
 205 }
 206
 207 /* Retrieve the cpuset for a task */
 208 static inline struct cpuset *task_cs(struct task_struct *task)
 209 {
 210         return css_cs(task_css(task, cpuset_cgrp_id));
 211 }
 212
 213 static inline struct cpuset *parent_cs(struct cpuset *cs)
 214 {
 215         return css_cs(cs->css.parent);
 216 }
 217
 218 /* bits in struct cpuset flags field */
 219 typedef enum {
 220         CS_ONLINE,
 221         CS_CPU_EXCLUSIVE,
 222         CS_MEM_EXCLUSIVE,
 223         CS_MEM_HARDWALL,
 224         CS_MEMORY_MIGRATE,
 225         CS_SCHED_LOAD_BALANCE,
 226         CS_SPREAD_PAGE,
 227         CS_SPREAD_SLAB,
 228 } cpuset_flagbits_t;
 229
 230 /* convenient tests for these bits */
 231 static inline bool is_cpuset_online(struct cpuset *cs)
 232 {
 233         return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
 234 }
 235
 236 static inline int is_cpu_exclusive(const struct cpuset *cs)
 237 {
 238         return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
 239 }
 240
 241 static inline int is_mem_exclusive(const struct cpuset *cs)
 242 {
 243         return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
 244 }
 245
 246 static inline int is_mem_hardwall(const struct cpuset *cs)
 247 {
 248         return test_bit(CS_MEM_HARDWALL, &cs->flags);
 249 }
 250
 251 static inline int is_sched_load_balance(const struct cpuset *cs)
 252 {
 253         return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
 254 }
 255
 256 static inline int is_memory_migrate(const struct cpuset *cs)
 257 {
 258         return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
 259 }
 260
 261 static inline int is_spread_page(const struct cpuset *cs)
 262 {
 263         return test_bit(CS_SPREAD_PAGE, &cs->flags);
 264 }
 265
 266 static inline int is_spread_slab(const struct cpuset *cs)
 267 {
 268         return test_bit(CS_SPREAD_SLAB, &cs->flags);
 269 }
 270
 271 static inline int is_partition_root(const struct cpuset *cs)
 272 {
 273         return cs->partition_root_state > 0;
 274 }
 275
 276 /*
 277  * Send notification event of whenever partition_root_state changes.
 278  */
 279 static inline void notify_partition_change(struct cpuset *cs,
 280                                            int old_prs, int new_prs)
 281 {
 282         if (old_prs != new_prs)
 283                 cgroup_file_notify(&cs->partition_file);
 284 }
 285
 286 static struct cpuset top_cpuset = {
 287         .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
 288                   (1 << CS_MEM_EXCLUSIVE)),
 289         .partition_root_state = PRS_ENABLED,
 290 };
 291
 292 /**
 293  * cpuset_for_each_child - traverse online children of a cpuset
 294  * @child_cs: loop cursor pointing to the current child
 295  * @pos_css: used for iteration
 296  * @parent_cs: target cpuset to walk children of
 297  *
 298  * Walk @child_cs through the online children of @parent_cs.  Must be used
 299  * with RCU read locked.
 300  */
 301 #define cpuset_for_each_child(child_cs, pos_css, parent_cs)             \
 302         css_for_each_child((pos_css), &(parent_cs)->css)                \
 303                 if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
 304
 305 /**
 306  * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
 307  * @des_cs: loop cursor pointing to the current descendant
 308  * @pos_css: used for iteration
 309  * @root_cs: target cpuset to walk ancestor of
 310  *
 311  * Walk @des_cs through the online descendants of @root_cs.  Must be used
 312  * with RCU read locked.  The caller may modify @pos_css by calling
 313  * css_rightmost_descendant() to skip subtree.  @root_cs is included in the
 314  * iteration and the first node to be visited.
 315  */
 316 #define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)        \
 317         css_for_each_descendant_pre((pos_css), &(root_cs)->css)         \
 318                 if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
 319
 320 /*
 321  * There are two global locks guarding cpuset structures - cpuset_rwsem and
 322  * callback_lock. We also require taking task_lock() when dereferencing a
 323  * task's cpuset pointer. See "The task_lock() exception", at the end of this
 324  * comment.  The cpuset code uses only cpuset_rwsem write lock.  Other
 325  * kernel subsystems can use cpuset_read_lock()/cpuset_read_unlock() to
 326  * prevent change to cpuset structures.
 327  *
 328  * A task must hold both locks to modify cpusets.  If a task holds
 329  * cpuset_rwsem, it blocks others wanting that rwsem, ensuring that it
 330  * is the only task able to also acquire callback_lock and be able to
 331  * modify cpusets.  It can perform various checks on the cpuset structure
 332  * first, knowing nothing will change.  It can also allocate memory while
 333  * just holding cpuset_rwsem.  While it is performing these checks, various
 334  * callback routines can briefly acquire callback_lock to query cpusets.
 335  * Once it is ready to make the changes, it takes callback_lock, blocking
 336  * everyone else.
 337  *
 338  * Calls to the kernel memory allocator can not be made while holding
 339  * callback_lock, as that would risk double tripping on callback_lock
 340  * from one of the callbacks into the cpuset code from within
 341  * __alloc_pages().
 342  *
 343  * If a task is only holding callback_lock, then it has read-only
 344  * access to cpusets.
 345  *
 346  * The task_struct fields mems_allowed and mempolicy may be changed
 347  * by another task, so we use alloc_lock in the task_struct to protect
 348  * them.
 349  *
 350  * The cpuset_common_file_read() handlers only hold callback_lock across
 351  * small pieces of code, such as when reading out possibly multi-word
 352  * cpumasks and nodemasks.
 353  *
 354  * Accessing a task's cpuset should be done in accordance with the
 355  * guidelines for accessing subsystem state in kernel/cgroup.c
 356  */
 357
 358 DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem);
 359
 360 void cpuset_read_lock(void)
 361 {
 362         percpu_down_read(&cpuset_rwsem);
 363 }
 364
 365 void cpuset_read_unlock(void)
 366 {
 367         percpu_up_read(&cpuset_rwsem);
 368 }
 369
 370 static DEFINE_SPINLOCK(callback_lock);
 371
 372 static struct workqueue_struct *cpuset_migrate_mm_wq;
 373
 374 /*
 375  * CPU / memory hotplug is handled asynchronously.
 376  */
 377 static void cpuset_hotplug_workfn(struct work_struct *work);
 378 static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
 379
 380 static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
 381
 382 static inline void check_insane_mems_config(nodemask_t *nodes)
 383 {
 384         if (!cpusets_insane_config() &&
 385                 movable_only_nodes(nodes)) {
 386                 static_branch_enable(&cpusets_insane_config_key);
 387                 pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n"
 388                         "Cpuset allocations might fail even with a lot of memory available.\n",
 389                         nodemask_pr_args(nodes));
 390         }
 391 }
 392
 393 /*
 394  * Cgroup v2 behavior is used on the "cpus" and "mems" control files when
 395  * on default hierarchy or when the cpuset_v2_mode flag is set by mounting
 396  * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option.
 397  * With v2 behavior, "cpus" and "mems" are always what the users have
 398  * requested and won't be changed by hotplug events. Only the effective
 399  * cpus or mems will be affected.
 400  */
 401 static inline bool is_in_v2_mode(void)
 402 {
 403         return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
 404               (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
 405 }
 406
 407 /*
 408  * Return in pmask the portion of a task's cpusets's cpus_allowed that
 409  * are online and are capable of running the task.  If none are found,
 410  * walk up the cpuset hierarchy until we find one that does have some
 411  * appropriate cpus.
 412  *
 413  * One way or another, we guarantee to return some non-empty subset
 414  * of cpu_online_mask.
 415  *
 416  * Call with callback_lock or cpuset_rwsem held.
 417  */
 418 static void guarantee_online_cpus(struct task_struct *tsk,
 419                                   struct cpumask *pmask)
 420 {
 421         const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
 422         struct cpuset *cs;
 423
 424         if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask)))
 425                 cpumask_copy(pmask, cpu_online_mask);
 426
 427         rcu_read_lock();
 428         cs = task_cs(tsk);
 429
 430         while (!cpumask_intersects(cs->effective_cpus, pmask)) {
 431                 cs = parent_cs(cs);
 432                 if (unlikely(!cs)) {
 433                         /*
 434                          * The top cpuset doesn't have any online cpu as a
 435                          * consequence of a race between cpuset_hotplug_work
 436                          * and cpu hotplug notifier.  But we know the top
 437                          * cpuset's effective_cpus is on its way to be
 438                          * identical to cpu_online_mask.
 439                          */
 440                         goto out_unlock;
 441                 }
 442         }
 443         cpumask_and(pmask, pmask, cs->effective_cpus);
 444
 445 out_unlock:
 446         rcu_read_unlock();
 447 }
 448
 449 /*
 450  * Return in *pmask the portion of a cpusets's mems_allowed that
 451  * are online, with memory.  If none are online with memory, walk
 452  * up the cpuset hierarchy until we find one that does have some
 453  * online mems.  The top cpuset always has some mems online.
 454  *
 455  * One way or another, we guarantee to return some non-empty subset
 456  * of node_states[N_MEMORY].
 457  *
 458  * Call with callback_lock or cpuset_rwsem held.
 459  */
 460 static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
 461 {
 462         while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
 463                 cs = parent_cs(cs);
 464         nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
 465 }
 466
 467 /*
 468  * update task's spread flag if cpuset's page/slab spread flag is set
 469  *
 470  * Call with callback_lock or cpuset_rwsem held.
 471  */
 472 static void cpuset_update_task_spread_flag(struct cpuset *cs,
 473                                         struct task_struct *tsk)
 474 {
 475         if (is_spread_page(cs))
 476                 task_set_spread_page(tsk);
 477         else
 478                 task_clear_spread_page(tsk);
 479
 480         if (is_spread_slab(cs))
 481                 task_set_spread_slab(tsk);
 482         else
 483                 task_clear_spread_slab(tsk);
 484 }
 485
 486 /*
 487  * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 488  *
 489  * One cpuset is a subset of another if all its allowed CPUs and
 490  * Memory Nodes are a subset of the other, and its exclusive flags
 491  * are only set if the other's are set.  Call holding cpuset_rwsem.
 492  */
 493
 494 static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
 495 {
 496         return  cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
 497                 nodes_subset(p->mems_allowed, q->mems_allowed) &&
 498                 is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
 499                 is_mem_exclusive(p) <= is_mem_exclusive(q);
 500 }
 501
 502 /**
 503  * alloc_cpumasks - allocate three cpumasks for cpuset
 504  * @cs:  the cpuset that have cpumasks to be allocated.
 505  * @tmp: the tmpmasks structure pointer
 506  * Return: 0 if successful, -ENOMEM otherwise.
 507  *
 508  * Only one of the two input arguments should be non-NULL.
 509  */
 510 static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
 511 {
 512         cpumask_var_t *pmask1, *pmask2, *pmask3;
 513
 514         if (cs) {
 515                 pmask1 = &cs->cpus_allowed;
 516                 pmask2 = &cs->effective_cpus;
 517                 pmask3 = &cs->subparts_cpus;
 518         } else {
 519                 pmask1 = &tmp->new_cpus;
 520                 pmask2 = &tmp->addmask;
 521                 pmask3 = &tmp->delmask;
 522         }
 523
 524         if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
 525                 return -ENOMEM;
 526
 527         if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
 528                 goto free_one;
 529
 530         if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
 531                 goto free_two;
 532
 533         return 0;
 534
 535 free_two:
 536         free_cpumask_var(*pmask2);
 537 free_one:
 538         free_cpumask_var(*pmask1);
 539         return -ENOMEM;
 540 }
 541
 542 /**
 543  * free_cpumasks - free cpumasks in a tmpmasks structure
 544  * @cs:  the cpuset that have cpumasks to be free.
 545  * @tmp: the tmpmasks structure pointer
 546  */
 547 static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
 548 {
 549         if (cs) {
 550                 free_cpumask_var(cs->cpus_allowed);
 551                 free_cpumask_var(cs->effective_cpus);
 552                 free_cpumask_var(cs->subparts_cpus);
 553         }
 554         if (tmp) {
 555                 free_cpumask_var(tmp->new_cpus);
 556                 free_cpumask_var(tmp->addmask);
 557                 free_cpumask_var(tmp->delmask);
 558         }
 559 }
 560
 561 /**
 562  * alloc_trial_cpuset - allocate a trial cpuset
 563  * @cs: the cpuset that the trial cpuset duplicates
 564  */
 565 static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
 566 {
 567         struct cpuset *trial;
 568
 569         trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
 570         if (!trial)
 571                 return NULL;
 572
 573         if (alloc_cpumasks(trial, NULL)) {
 574                 kfree(trial);
 575                 return NULL;
 576         }
 577
 578         cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
 579         cpumask_copy(trial->effective_cpus, cs->effective_cpus);
 580         return trial;
 581 }
 582
 583 /**
 584  * free_cpuset - free the cpuset
 585  * @cs: the cpuset to be freed
 586  */
 587 static inline void free_cpuset(struct cpuset *cs)
 588 {
 589         free_cpumasks(cs, NULL);
 590         kfree(cs);
 591 }
 592
 593 /*
 594  * validate_change() - Used to validate that any proposed cpuset change
 595  *                     follows the structural rules for cpusets.
 596  *
 597  * If we replaced the flag and mask values of the current cpuset
 598  * (cur) with those values in the trial cpuset (trial), would
 599  * our various subset and exclusive rules still be valid?  Presumes
 600  * cpuset_rwsem held.
 601  *
 602  * 'cur' is the address of an actual, in-use cpuset.  Operations
 603  * such as list traversal that depend on the actual address of the
 604  * cpuset in the list must use cur below, not trial.
 605  *
 606  * 'trial' is the address of bulk structure copy of cur, with
 607  * perhaps one or more of the fields cpus_allowed, mems_allowed,
 608  * or flags changed to new, trial values.
 609  *
 610  * Return 0 if valid, -errno if not.
 611  */
 612
 613 static int validate_change(struct cpuset *cur, struct cpuset *trial)
 614 {
 615         struct cgroup_subsys_state *css;
 616         struct cpuset *c, *par;
 617         int ret;
 618
 619         /* The checks don't apply to root cpuset */
 620         if (cur == &top_cpuset)
 621                 return 0;
 622
 623         rcu_read_lock();
 624         par = parent_cs(cur);
 625
 626         /* On legacy hierarchy, we must be a subset of our parent cpuset. */
 627         ret = -EACCES;
 628         if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
 629                 goto out;
 630
 631         /*
 632          * If either I or some sibling (!= me) is exclusive, we can't
 633          * overlap
 634          */
 635         ret = -EINVAL;
 636         cpuset_for_each_child(c, css, par) {
 637                 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
 638                     c != cur &&
 639                     cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
 640                         goto out;
 641                 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
 642                     c != cur &&
 643                     nodes_intersects(trial->mems_allowed, c->mems_allowed))
 644                         goto out;
 645         }
 646
 647         /*
 648          * Cpusets with tasks - existing or newly being attached - can't
 649          * be changed to have empty cpus_allowed or mems_allowed.
 650          */
 651         ret = -ENOSPC;
 652         if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
 653                 if (!cpumask_empty(cur->cpus_allowed) &&
 654                     cpumask_empty(trial->cpus_allowed))
 655                         goto out;
 656                 if (!nodes_empty(cur->mems_allowed) &&
 657                     nodes_empty(trial->mems_allowed))
 658                         goto out;
 659         }
 660
 661         /*
 662          * We can't shrink if we won't have enough room for SCHED_DEADLINE
 663          * tasks.
 664          */
 665         ret = -EBUSY;
 666         if (is_cpu_exclusive(cur) &&
 667             !cpuset_cpumask_can_shrink(cur->cpus_allowed,
 668                                        trial->cpus_allowed))
 669                 goto out;
 670
 671         ret = 0;
 672 out:
 673         rcu_read_unlock();
 674         return ret;
 675 }
 676
 677 #ifdef CONFIG_SMP
 678 /*
 679  * Helper routine for generate_sched_domains().
 680  * Do cpusets a, b have overlapping effective cpus_allowed masks?
 681  */
 682 static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
 683 {
 684         return cpumask_intersects(a->effective_cpus, b->effective_cpus);
 685 }
 686
 687 static void
 688 update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
 689 {
 690         if (dattr->relax_domain_level < c->relax_domain_level)
 691                 dattr->relax_domain_level = c->relax_domain_level;
 692         return;
 693 }
 694
 695 static void update_domain_attr_tree(struct sched_domain_attr *dattr,
 696                                     struct cpuset *root_cs)
 697 {
 698         struct cpuset *cp;
 699         struct cgroup_subsys_state *pos_css;
 700
 701         rcu_read_lock();
 702         cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
 703                 /* skip the whole subtree if @cp doesn't have any CPU */
 704                 if (cpumask_empty(cp->cpus_allowed)) {
 705                         pos_css = css_rightmost_descendant(pos_css);
 706                         continue;
 707                 }
 708
 709                 if (is_sched_load_balance(cp))
 710                         update_domain_attr(dattr, cp);
 711         }
 712         rcu_read_unlock();
 713 }
 714
 715 /* Must be called with cpuset_rwsem held.  */
 716 static inline int nr_cpusets(void)
 717 {
 718         /* jump label reference count + the top-level cpuset */
 719         return static_key_count(&cpusets_enabled_key.key) + 1;
 720 }
 721
 722 /*
 723  * generate_sched_domains()
 724  *
 725  * This function builds a partial partition of the system's CPUs.
 726  * A 'partial partition' is a set of non-overlapping subsets whose
 727  * union is a subset of that set.
 728  * The output of this function needs to be passed to kernel/sched/core.c
 729  * partition_sched_domains() routine, which will rebuild the scheduler's
 730  * load balancing domains (sched domains) as specified by that partial
 731  * partition.
 732  *
 733  * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst
 734  * for a background explanation of this.
 735  *
 736  * Does not return errors, on the theory that the callers of this
 737  * routine would rather not worry about failures to rebuild sched
 738  * domains when operating in the severe memory shortage situations
 739  * that could cause allocation failures below.
 740  *
 741  * Must be called with cpuset_rwsem held.
 742  *
 743  * The three key local variables below are:
 744  *    cp - cpuset pointer, used (together with pos_css) to perform a
 745  *         top-down scan of all cpusets. For our purposes, rebuilding
 746  *         the scheduler's sched domains, we can ignore !is_sched_load_
 747  *         balance cpusets.
 748  *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
 749  *         that need to be load balanced, for convenient iterative
 750  *         access by the subsequent code that finds the best partition,
 751  *         i.e the set of domains (subsets) of CPUs such that the
 752  *         cpus_allowed of every cpuset marked is_sched_load_balance
 753  *         is a subset of one of these domains, while there are as
 754  *         many such domains as possible, each as small as possible.
 755  * doms  - Conversion of 'csa' to an array of cpumasks, for passing to
 756  *         the kernel/sched/core.c routine partition_sched_domains() in a
 757  *         convenient format, that can be easily compared to the prior
 758  *         value to determine what partition elements (sched domains)
 759  *         were changed (added or removed.)
 760  *
 761  * Finding the best partition (set of domains):
 762  *      The triple nested loops below over i, j, k scan over the
 763  *      load balanced cpusets (using the array of cpuset pointers in
 764  *      csa[]) looking for pairs of cpusets that have overlapping
 765  *      cpus_allowed, but which don't have the same 'pn' partition
 766  *      number and gives them the same partition number.  It keeps
 767  *      looping on the 'restart' label until it can no longer find
 768  *      any such pairs.
 769  *
 770  *      The union of the cpus_allowed masks from the set of
 771  *      all cpusets having the same 'pn' value then form the one
 772  *      element of the partition (one sched domain) to be passed to
 773  *      partition_sched_domains().
 774  */
 775 static int generate_sched_domains(cpumask_var_t **domains,
 776                         struct sched_domain_attr **attributes)
 777 {
 778         struct cpuset *cp;      /* top-down scan of cpusets */
 779         struct cpuset **csa;    /* array of all cpuset ptrs */
 780         int csn;                /* how many cpuset ptrs in csa so far */
 781         int i, j, k;            /* indices for partition finding loops */
 782         cpumask_var_t *doms;    /* resulting partition; i.e. sched domains */
 783         struct sched_domain_attr *dattr;  /* attributes for custom domains */
 784         int ndoms = 0;          /* number of sched domains in result */
 785         int nslot;              /* next empty doms[] struct cpumask slot */
 786         struct cgroup_subsys_state *pos_css;
 787         bool root_load_balance = is_sched_load_balance(&top_cpuset);
 788
 789         doms = NULL;
 790         dattr = NULL;
 791         csa = NULL;
 792
 793         /* Special case for the 99% of systems with one, full, sched domain */
 794         if (root_load_balance && !top_cpuset.nr_subparts_cpus) {
 795                 ndoms = 1;
 796                 doms = alloc_sched_domains(ndoms);
 797                 if (!doms)
 798                         goto done;
 799
 800                 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
 801                 if (dattr) {
 802                         *dattr = SD_ATTR_INIT;
 803                         update_domain_attr_tree(dattr, &top_cpuset);
 804                 }
 805                 cpumask_and(doms[0], top_cpuset.effective_cpus,
 806                             housekeeping_cpumask(HK_FLAG_DOMAIN));
 807
 808                 goto done;
 809         }
 810
 811         csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
 812         if (!csa)
 813                 goto done;
 814         csn = 0;
 815
 816         rcu_read_lock();
 817         if (root_load_balance)
 818                 csa[csn++] = &top_cpuset;
 819         cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
 820                 if (cp == &top_cpuset)
 821                         continue;
 822                 /*
 823                  * Continue traversing beyond @cp iff @cp has some CPUs and
 824                  * isn't load balancing.  The former is obvious.  The
 825                  * latter: All child cpusets contain a subset of the
 826                  * parent's cpus, so just skip them, and then we call
 827                  * update_domain_attr_tree() to calc relax_domain_level of
 828                  * the corresponding sched domain.
 829                  *
 830                  * If root is load-balancing, we can skip @cp if it
 831                  * is a subset of the root's effective_cpus.
 832                  */
 833                 if (!cpumask_empty(cp->cpus_allowed) &&
 834                     !(is_sched_load_balance(cp) &&
 835                       cpumask_intersects(cp->cpus_allowed,
 836                                          housekeeping_cpumask(HK_FLAG_DOMAIN))))
 837                         continue;
 838
 839                 if (root_load_balance &&
 840                     cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
 841                         continue;
 842
 843                 if (is_sched_load_balance(cp) &&
 844                     !cpumask_empty(cp->effective_cpus))
 845                         csa[csn++] = cp;
 846
 847                 /* skip @cp's subtree if not a partition root */
 848                 if (!is_partition_root(cp))
 849                         pos_css = css_rightmost_descendant(pos_css);
 850         }
 851         rcu_read_unlock();
 852
 853         for (i = 0; i < csn; i++)
 854                 csa[i]->pn = i;
 855         ndoms = csn;
 856
 857 restart:
 858         /* Find the best partition (set of sched domains) */
 859         for (i = 0; i < csn; i++) {
 860                 struct cpuset *a = csa[i];
 861                 int apn = a->pn;
 862
 863                 for (j = 0; j < csn; j++) {
 864                         struct cpuset *b = csa[j];
 865                         int bpn = b->pn;
 866
 867                         if (apn != bpn && cpusets_overlap(a, b)) {
 868                                 for (k = 0; k < csn; k++) {
 869                                         struct cpuset *c = csa[k];
 870
 871                                         if (c->pn == bpn)
 872                                                 c->pn = apn;
 873                                 }
 874                                 ndoms--;        /* one less element */
 875                                 goto restart;
 876                         }
 877                 }
 878         }
 879
 880         /*
 881          * Now we know how many domains to create.
 882          * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
 883          */
 884         doms = alloc_sched_domains(ndoms);
 885         if (!doms)
 886                 goto done;
 887
 888         /*
 889          * The rest of the code, including the scheduler, can deal with
 890          * dattr==NULL case. No need to abort if alloc fails.
 891          */
 892         dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
 893                               GFP_KERNEL);
 894
 895         for (nslot = 0, i = 0; i < csn; i++) {
 896                 struct cpuset *a = csa[i];
 897                 struct cpumask *dp;
 898                 int apn = a->pn;
 899
 900                 if (apn < 0) {
 901                         /* Skip completed partitions */
 902                         continue;
 903                 }
 904
 905                 dp = doms[nslot];
 906
 907                 if (nslot == ndoms) {
 908                         static int warnings = 10;
 909                         if (warnings) {
 910                                 pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
 911                                         nslot, ndoms, csn, i, apn);
 912                                 warnings--;
 913                         }
 914                         continue;
 915                 }
 916
 917                 cpumask_clear(dp);
 918                 if (dattr)
 919                         *(dattr + nslot) = SD_ATTR_INIT;
 920                 for (j = i; j < csn; j++) {
 921                         struct cpuset *b = csa[j];
 922
 923                         if (apn == b->pn) {
 924                                 cpumask_or(dp, dp, b->effective_cpus);
 925                                 cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
 926                                 if (dattr)
 927                                         update_domain_attr_tree(dattr + nslot, b);
 928
 929                                 /* Done with this partition */
 930                                 b->pn = -1;
 931                         }
 932                 }
 933                 nslot++;
 934         }
 935         BUG_ON(nslot != ndoms);
 936
 937 done:
 938         kfree(csa);
 939
 940         /*
 941          * Fallback to the default domain if kmalloc() failed.
 942          * See comments in partition_sched_domains().
 943          */
 944         if (doms == NULL)
 945                 ndoms = 1;
 946
 947         *domains    = doms;
 948         *attributes = dattr;
 949         return ndoms;
 950 }
 951
 952 static void update_tasks_root_domain(struct cpuset *cs)
 953 {
 954         struct css_task_iter it;
 955         struct task_struct *task;
 956
 957         css_task_iter_start(&cs->css, 0, &it);
 958
 959         while ((task = css_task_iter_next(&it)))
 960                 dl_add_task_root_domain(task);
 961
 962         css_task_iter_end(&it);
 963 }
 964
 965 static void rebuild_root_domains(void)
 966 {
 967         struct cpuset *cs = NULL;
 968         struct cgroup_subsys_state *pos_css;
 969
 970         percpu_rwsem_assert_held(&cpuset_rwsem);
 971         lockdep_assert_cpus_held();
 972         lockdep_assert_held(&sched_domains_mutex);
 973
 974         rcu_read_lock();
 975
 976         /*
 977          * Clear default root domain DL accounting, it will be computed again
 978          * if a task belongs to it.
 979          */
 980         dl_clear_root_domain(&def_root_domain);
 981
 982         cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
 983
 984                 if (cpumask_empty(cs->effective_cpus)) {
 985                         pos_css = css_rightmost_descendant(pos_css);
 986                         continue;
 987                 }
 988
 989                 css_get(&cs->css);
 990
 991                 rcu_read_unlock();
 992
 993                 update_tasks_root_domain(cs);
 994
 995                 rcu_read_lock();
 996                 css_put(&cs->css);
 997         }
 998         rcu_read_unlock();
 999 }
1000
/*
 * Hand the new domain set (@ndoms_new masks in @doms_new, with optional
 * attributes in @dattr_new) to partition_sched_domains_locked() and then
 * run rebuild_root_domains().  Both steps execute under
 * sched_domains_mutex, which is taken and released here.
 */
static void
partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
                                    struct sched_domain_attr *dattr_new)
{
        mutex_lock(&sched_domains_mutex);
        partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
        rebuild_root_domains();
        mutex_unlock(&sched_domains_mutex);
}
1010
1011 /*
1012  * Rebuild scheduler domains.
1013  *
1014  * If the flag 'sched_load_balance' of any cpuset with non-empty
1015  * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
1016  * which has that flag enabled, or if any cpuset with a non-empty
1017  * 'cpus' is removed, then call this routine to rebuild the
1018  * scheduler's dynamic sched domains.
1019  *
1020  * Call with cpuset_rwsem held.  Takes cpus_read_lock().
1021  */
1022 static void rebuild_sched_domains_locked(void)
1023 {
1024         struct cgroup_subsys_state *pos_css;
1025         struct sched_domain_attr *attr;
1026         cpumask_var_t *doms;
1027         struct cpuset *cs;
1028         int ndoms;
1029
1030         lockdep_assert_cpus_held();
1031         percpu_rwsem_assert_held(&cpuset_rwsem);
1032
1033         /*
1034          * If we have raced with CPU hotplug, return early to avoid
1035          * passing doms with offlined cpu to partition_sched_domains().
1036          * Anyways, cpuset_hotplug_workfn() will rebuild sched domains.
1037          *
1038          * With no CPUs in any subpartitions, top_cpuset's effective CPUs
1039          * should be the same as the active CPUs, so checking only top_cpuset
1040          * is enough to detect racing CPU offlines.
1041          */
1042         if (!top_cpuset.nr_subparts_cpus &&
1043             !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
1044                 return;
1045
1046         /*
1047          * With subpartition CPUs, however, the effective CPUs of a partition
1048          * root should be only a subset of the active CPUs.  Since a CPU in any
1049          * partition root could be offlined, all must be checked.
1050          */
1051         if (top_cpuset.nr_subparts_cpus) {
1052                 rcu_read_lock();
1053                 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
1054                         if (!is_partition_root(cs)) {
1055                                 pos_css = css_rightmost_descendant(pos_css);
1056                                 continue;
1057                         }
1058                         if (!cpumask_subset(cs->effective_cpus,
1059                                             cpu_active_mask)) {
1060                                 rcu_read_unlock();
1061                                 return;
1062                         }
1063                 }
1064                 rcu_read_unlock();
1065         }
1066
1067         /* Generate domain masks and attrs */
1068         ndoms = generate_sched_domains(&doms, &attr);
1069
1070         /* Have scheduler rebuild the domains */
1071         partition_and_rebuild_sched_domains(ndoms, doms, attr);
1072 }
1073 #else /* !CONFIG_SMP */
/* Empty stand-in used when CONFIG_SMP is not set. */
static void rebuild_sched_domains_locked(void)
{
}
1077 #endif /* CONFIG_SMP */
1078
/*
 * Locking wrapper around rebuild_sched_domains_locked(): takes the CPU
 * hotplug read lock first, then cpuset_rwsem for writing, performs the
 * rebuild and releases both locks in the reverse order.
 */
void rebuild_sched_domains(void)
{
        cpus_read_lock();
        percpu_down_write(&cpuset_rwsem);
        rebuild_sched_domains_locked();
        percpu_up_write(&cpuset_rwsem);
        cpus_read_unlock();
}
1087
1088 /**
1089  * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
1090  * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
1091  *
1092  * Iterate through each task of @cs updating its cpus_allowed to the
1093  * effective cpuset's.  As this function is called with cpuset_rwsem held,
1094  * cpuset membership stays stable.
1095  */
1096 static void update_tasks_cpumask(struct cpuset *cs)
1097 {
1098         struct css_task_iter it;
1099         struct task_struct *task;
1100
1101         css_task_iter_start(&cs->css, 0, &it);
1102         while ((task = css_task_iter_next(&it)))
1103                 set_cpus_allowed_ptr(task, cs->effective_cpus);
1104         css_task_iter_end(&it);
1105 }
1106
1107 /**
1108  * compute_effective_cpumask - Compute the effective cpumask of the cpuset
1109  * @new_cpus: the temp variable for the new effective_cpus mask
1110  * @cs: the cpuset the need to recompute the new effective_cpus mask
1111  * @parent: the parent cpuset
1112  *
1113  * If the parent has subpartition CPUs, include them in the list of
1114  * allowable CPUs in computing the new effective_cpus mask. Since offlined
1115  * CPUs are not removed from subparts_cpus, we have to use cpu_active_mask
1116  * to mask those out.
1117  */
1118 static void compute_effective_cpumask(struct cpumask *new_cpus,
1119                                       struct cpuset *cs, struct cpuset *parent)
1120 {
1121         if (parent->nr_subparts_cpus) {
1122                 cpumask_or(new_cpus, parent->effective_cpus,
1123                            parent->subparts_cpus);
1124                 cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
1125                 cpumask_and(new_cpus, new_cpus, cpu_active_mask);
1126         } else {
1127                 cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
1128         }
1129 }
1130
/*
 * Commands for update_parent_subparts_cpumask(), passed as its @cmd
 * argument.
 */
enum subparts_cmd {
        partcmd_enable,         /* Enable partition root         */
        partcmd_disable,        /* Disable partition root        */
        partcmd_update,         /* Update parent's subparts_cpus */
};
1139
/**
 * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
 * @cpuset:  The cpuset that requests change in partition root state
 * @cmd:     Partition root state change command
 * @newmask: Optional new cpumask for partcmd_update
 * @tmp:     Temporary addmask and delmask
 * Return:   0, 1 or an error code
 *
 * For partcmd_enable, the cpuset is being transformed from a non-partition
 * root to a partition root. The cpus_allowed mask of the given cpuset will
 * be put into parent's subparts_cpus and taken away from parent's
 * effective_cpus. The function will return 0 if all the CPUs listed in
 * cpus_allowed can be granted or an error code will be returned.
 *
 * For partcmd_disable, the cpuset is being transformed from a partition
 * root back to a non-partition root. Any CPUs in cpus_allowed that are in
 * parent's subparts_cpus will be taken away from that cpumask and put back
 * into parent's effective_cpus. 0 should always be returned.
 *
 * For partcmd_update, if the optional newmask is specified, the cpu
 * list is to be changed from cpus_allowed to newmask. Otherwise,
 * cpus_allowed is assumed to remain the same. The cpuset should either
 * be a partition root or an invalid partition root. The partition root
 * state may change if newmask is NULL and none of the requested CPUs can
 * be granted by the parent. The function will return 1 if changes to
 * parent's subparts_cpus and effective_cpus happen or 0 otherwise.
 * Error code should only be returned when newmask is non-NULL.
 *
 * The partcmd_enable and partcmd_disable commands are used by
 * update_prstate(). The partcmd_update command is used by
 * update_cpumasks_hier() with newmask NULL and update_cpumask() with
 * newmask set.
 *
 * The checking is more strict when enabling partition root than the
 * other two commands.
 *
 * Because of the implicit cpu exclusive nature of a partition root,
 * cpumask changes that violate the cpu exclusivity rule will not be
 * permitted when checked by validate_change(). The validate_change()
 * function will also prevent any changes to the cpu list if it is not
 * a superset of children's cpu lists.
 *
 * Must be called with cpuset_rwsem held (asserted on entry). The parent's
 * masks and the cpuset's partition_root_state are modified under
 * callback_lock.
 */
static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
                                          struct cpumask *newmask,
                                          struct tmpmasks *tmp)
{
        struct cpuset *parent = parent_cs(cpuset);
        int adding;     /* Moving cpus from effective_cpus to subparts_cpus */
        int deleting;   /* Moving cpus from subparts_cpus to effective_cpus */
        int old_prs, new_prs;
        bool part_error = false;        /* Partition error? */

        percpu_rwsem_assert_held(&cpuset_rwsem);

        /*
         * The parent must be a partition root.
         * The new cpumask, if present, or the current cpus_allowed must
         * not be empty.
         */
        if (!is_partition_root(parent) ||
           (newmask && cpumask_empty(newmask)) ||
           (!newmask && cpumask_empty(cpuset->cpus_allowed)))
                return -EINVAL;

        /*
         * Enabling/disabling partition root is not allowed if there are
         * online children.
         */
        if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css))
                return -EBUSY;

        /*
         * Enabling partition root is not allowed if not all the CPUs
         * can be granted from parent's effective_cpus or at least one
         * CPU will be left after that.
         */
        if ((cmd == partcmd_enable) &&
           (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) ||
             cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus)))
                return -EINVAL;

        /*
         * A cpumask update cannot make parent's effective_cpus become empty.
         *
         * The branches below fill tmp->addmask / tmp->delmask and set
         * adding / deleting according to the command.
         */
        adding = deleting = false;
        old_prs = new_prs = cpuset->partition_root_state;
        if (cmd == partcmd_enable) {
                /* Everything in cpus_allowed moves to parent's subparts_cpus */
                cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
                adding = true;
        } else if (cmd == partcmd_disable) {
                /* Give back whatever the parent currently holds for us */
                deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
                                       parent->subparts_cpus);
        } else if (newmask) {
                /*
                 * partcmd_update with newmask:
                 *
                 * delmask = cpus_allowed & ~newmask & parent->subparts_cpus
                 * addmask = newmask & parent->effective_cpus
                 *                   & ~parent->subparts_cpus
                 */
                cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
                deleting = cpumask_and(tmp->delmask, tmp->delmask,
                                       parent->subparts_cpus);

                cpumask_and(tmp->addmask, newmask, parent->effective_cpus);
                adding = cpumask_andnot(tmp->addmask, tmp->addmask,
                                        parent->subparts_cpus);
                /*
                 * Return error if the new effective_cpus could become empty.
                 */
                if (adding &&
                    cpumask_equal(parent->effective_cpus, tmp->addmask)) {
                        if (!deleting)
                                return -EINVAL;
                        /*
                         * As some of the CPUs in subparts_cpus might have
                         * been offlined, we need to compute the real delmask
                         * to confirm that.
                         *
                         * tmp->addmask is used as scratch space for that
                         * test and restored right afterwards.
                         */
                        if (!cpumask_and(tmp->addmask, tmp->delmask,
                                         cpu_active_mask))
                                return -EINVAL;
                        cpumask_copy(tmp->addmask, parent->effective_cpus);
                }
        } else {
                /*
                 * partcmd_update w/o newmask:
                 *
                 * addmask = cpus_allowed & parent->effective_cpus
                 *
                 * Note that parent's subparts_cpus may have been
                 * pre-shrunk in case there is a change in the cpu list.
                 * So no deletion is needed.
                 */
                adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
                                     parent->effective_cpus);
                /* Taking all of the parent's effective CPUs is an error */
                part_error = cpumask_equal(tmp->addmask,
                                           parent->effective_cpus);
        }

        if (cmd == partcmd_update) {
                int prev_prs = cpuset->partition_root_state;

                /*
                 * Check for possible transition between PRS_ENABLED
                 * and PRS_ERROR.
                 */
                switch (cpuset->partition_root_state) {
                case PRS_ENABLED:
                        if (part_error)
                                new_prs = PRS_ERROR;
                        break;
                case PRS_ERROR:
                        if (!part_error)
                                new_prs = PRS_ENABLED;
                        break;
                }
                /*
                 * Set part_error if previously in invalid state.
                 *
                 * From here on part_error no longer describes the result of
                 * the mask computation above, only the previous state.
                 */
                part_error = (prev_prs == PRS_ERROR);
        }

        /*
         * NOTE(review): this return is taken before new_prs is stored in
         * cpuset->partition_root_state further down, so a PRS_ENABLED ->
         * PRS_ERROR decision made in the switch above is not committed on
         * this path.  Confirm that this is intended.
         */
        if (!part_error && (new_prs == PRS_ERROR))
                return 0;       /* Nothing need to be done */

        if (new_prs == PRS_ERROR) {
                /*
                 * Remove all its cpus from parent's subparts_cpus.
                 */
                adding = false;
                deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
                                       parent->subparts_cpus);
        }

        /* Neither the masks nor the state change: nothing to commit */
        if (!adding && !deleting && (new_prs == old_prs))
                return 0;

        /*
         * Change the parent's subparts_cpus.
         * Newly added CPUs will be removed from effective_cpus and
         * newly deleted ones will be added back to effective_cpus.
         */
        spin_lock_irq(&callback_lock);
        if (adding) {
                cpumask_or(parent->subparts_cpus,
                           parent->subparts_cpus, tmp->addmask);
                cpumask_andnot(parent->effective_cpus,
                               parent->effective_cpus, tmp->addmask);
        }
        if (deleting) {
                cpumask_andnot(parent->subparts_cpus,
                               parent->subparts_cpus, tmp->delmask);
                /*
                 * Some of the CPUs in subparts_cpus might have been offlined.
                 */
                cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask);
                cpumask_or(parent->effective_cpus,
                           parent->effective_cpus, tmp->delmask);
        }

        /* Keep the cached count in sync with the updated mask */
        parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);

        if (old_prs != new_prs)
                cpuset->partition_root_state = new_prs;

        spin_unlock_irq(&callback_lock);
        notify_partition_change(cpuset, old_prs, new_prs);

        /* 1 only for partcmd_update, 0 for enable/disable */
        return cmd == partcmd_update;
}
1351
1352 /*
1353  * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
1354  * @cs:  the cpuset to consider
1355  * @tmp: temp variables for calculating effective_cpus & partition setup
1356  *
1357  * When configured cpumask is changed, the effective cpumasks of this cpuset
1358  * and all its descendants need to be updated.
1359  *
1360  * On legacy hierarchy, effective_cpus will be the same with cpu_allowed.
1361  *
1362  * Called with cpuset_rwsem held
1363  */
1364 static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
1365 {
1366         struct cpuset *cp;
1367         struct cgroup_subsys_state *pos_css;
1368         bool need_rebuild_sched_domains = false;
1369         int old_prs, new_prs;
1370
1371         rcu_read_lock();
1372         cpuset_for_each_descendant_pre(cp, pos_css, cs) {
1373                 struct cpuset *parent = parent_cs(cp);
1374
1375                 compute_effective_cpumask(tmp->new_cpus, cp, parent);
1376
1377                 /*
1378                  * If it becomes empty, inherit the effective mask of the
1379                  * parent, which is guaranteed to have some CPUs.
1380                  */
1381                 if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
1382                         cpumask_copy(tmp->new_cpus, parent->effective_cpus);
1383                         if (!cp->use_parent_ecpus) {
1384                                 cp->use_parent_ecpus = true;
1385                                 parent->child_ecpus_count++;
1386                         }
1387                 } else if (cp->use_parent_ecpus) {
1388                         cp->use_parent_ecpus = false;
1389                         WARN_ON_ONCE(!parent->child_ecpus_count);
1390                         parent->child_ecpus_count--;
1391                 }
1392
1393                 /*
1394                  * Skip the whole subtree if the cpumask remains the same
1395                  * and has no partition root state.
1396                  */
1397                 if (!cp->partition_root_state &&
1398                     cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
1399                         pos_css = css_rightmost_descendant(pos_css);
1400                         continue;
1401                 }
1402
1403                 /*
1404                  * update_parent_subparts_cpumask() should have been called
1405                  * for cs already in update_cpumask(). We should also call
1406                  * update_tasks_cpumask() again for tasks in the parent
1407                  * cpuset if the parent's subparts_cpus changes.
1408                  */
1409                 old_prs = new_prs = cp->partition_root_state;
1410                 if ((cp != cs) && old_prs) {
1411                         switch (parent->partition_root_state) {
1412                         case PRS_DISABLED:
1413                                 /*
1414                                  * If parent is not a partition root or an
1415                                  * invalid partition root, clear its state
1416                                  * and its CS_CPU_EXCLUSIVE flag.
1417                                  */
1418                                 WARN_ON_ONCE(cp->partition_root_state
1419                                              != PRS_ERROR);
1420                                 new_prs = PRS_DISABLED;
1421
1422                                 /*
1423                                  * clear_bit() is an atomic operation and
1424                                  * readers aren't interested in the state
1425                                  * of CS_CPU_EXCLUSIVE anyway. So we can
1426                                  * just update the flag without holding
1427                                  * the callback_lock.
1428                                  */
1429                                 clear_bit(CS_CPU_EXCLUSIVE, &cp->flags);
1430                                 break;
1431
1432                         case PRS_ENABLED:
1433                                 if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
1434                                         update_tasks_cpumask(parent);
1435                                 break;
1436
1437                         case PRS_ERROR:
1438                                 /*
1439                                  * When parent is invalid, it has to be too.
1440                                  */
1441                                 new_prs = PRS_ERROR;
1442                                 break;
1443                         }
1444                 }
1445
1446                 if (!css_tryget_online(&cp->css))
1447                         continue;
1448                 rcu_read_unlock();
1449
1450                 spin_lock_irq(&callback_lock);
1451
1452                 cpumask_copy(cp->effective_cpus, tmp->new_cpus);
1453                 if (cp->nr_subparts_cpus && (new_prs != PRS_ENABLED)) {
1454                         cp->nr_subparts_cpus = 0;
1455                         cpumask_clear(cp->subparts_cpus);
1456                 } else if (cp->nr_subparts_cpus) {
1457                         /*
1458                          * Make sure that effective_cpus & subparts_cpus
1459                          * are mutually exclusive.
1460                          *
1461                          * In the unlikely event that effective_cpus
1462                          * becomes empty. we clear cp->nr_subparts_cpus and
1463                          * let its child partition roots to compete for
1464                          * CPUs again.
1465                          */
1466                         cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
1467                                        cp->subparts_cpus);
1468                         if (cpumask_empty(cp->effective_cpus)) {
1469                                 cpumask_copy(cp->effective_cpus, tmp->new_cpus);
1470                                 cpumask_clear(cp->subparts_cpus);
1471                                 cp->nr_subparts_cpus = 0;
1472                         } else if (!cpumask_subset(cp->subparts_cpus,
1473                                                    tmp->new_cpus)) {
1474                                 cpumask_andnot(cp->subparts_cpus,
1475                                         cp->subparts_cpus, tmp->new_cpus);
1476                                 cp->nr_subparts_cpus
1477                                         = cpumask_weight(cp->subparts_cpus);
1478                         }
1479                 }
1480
1481                 if (new_prs != old_prs)
1482                         cp->partition_root_state = new_prs;
1483
1484                 spin_unlock_irq(&callback_lock);
1485                 notify_partition_change(cp, old_prs, new_prs);
1486
1487                 WARN_ON(!is_in_v2_mode() &&
1488                         !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
1489
1490                 update_tasks_cpumask(cp);
1491
1492                 /*
1493                  * On legacy hierarchy, if the effective cpumask of any non-
1494                  * empty cpuset is changed, we need to rebuild sched domains.
1495                  * On default hierarchy, the cpuset needs to be a partition
1496                  * root as well.
1497                  */
1498                 if (!cpumask_empty(cp->cpus_allowed) &&
1499                     is_sched_load_balance(cp) &&
1500                    (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
1501                     is_partition_root(cp)))
1502                         need_rebuild_sched_domains = true;
1503
1504                 rcu_read_lock();
1505                 css_put(&cp->css);
1506         }
1507         rcu_read_unlock();
1508
1509         if (need_rebuild_sched_domains)
1510                 rebuild_sched_domains_locked();
1511 }
1512
1513 /**
1514  * update_sibling_cpumasks - Update siblings cpumasks
1515  * @parent:  Parent cpuset
1516  * @cs:      Current cpuset
1517  * @tmp:     Temp variables
1518  */
1519 static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
1520                                     struct tmpmasks *tmp)
1521 {
1522         struct cpuset *sibling;
1523         struct cgroup_subsys_state *pos_css;
1524
1525         /*
1526          * Check all its siblings and call update_cpumasks_hier()
1527          * if their use_parent_ecpus flag is set in order for them
1528          * to use the right effective_cpus value.
1529          */
1530         rcu_read_lock();
1531         cpuset_for_each_child(sibling, pos_css, parent) {
1532                 if (sibling == cs)
1533                         continue;
1534                 if (!sibling->use_parent_ecpus)
1535                         continue;
1536
1537                 update_cpumasks_hier(sibling, tmp);
1538         }
1539         rcu_read_unlock();
1540 }
1541
1542 /**
1543  * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
1544  * @cs: the cpuset to consider
1545  * @trialcs: trial cpuset
1546  * @buf: buffer of cpu numbers written to this cpuset
1547  */
1548 static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
1549                           const char *buf)
1550 {
1551         int retval;
1552         struct tmpmasks tmp;
1553
1554         /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
1555         if (cs == &top_cpuset)
1556                 return -EACCES;
1557
1558         /*
1559          * An empty cpus_allowed is ok only if the cpuset has no tasks.
1560          * Since cpulist_parse() fails on an empty mask, we special case
1561          * that parsing.  The validate_change() call ensures that cpusets
1562          * with tasks have cpus.
1563          */
1564         if (!*buf) {
1565                 cpumask_clear(trialcs->cpus_allowed);
1566         } else {
1567                 retval = cpulist_parse(buf, trialcs->cpus_allowed);
1568                 if (retval < 0)
1569                         return retval;
1570
1571                 if (!cpumask_subset(trialcs->cpus_allowed,
1572                                     top_cpuset.cpus_allowed))
1573                         return -EINVAL;
1574         }
1575
1576         /* Nothing to do if the cpus didn't change */
1577         if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
1578                 return 0;
1579
1580         retval = validate_change(cs, trialcs);
1581         if (retval < 0)
1582                 return retval;
1583
1584 #ifdef CONFIG_CPUMASK_OFFSTACK
1585         /*
1586          * Use the cpumasks in trialcs for tmpmasks when they are pointers
1587          * to allocated cpumasks.
1588          */
1589         tmp.addmask  = trialcs->subparts_cpus;
1590         tmp.delmask  = trialcs->effective_cpus;
1591         tmp.new_cpus = trialcs->cpus_allowed;
1592 #endif
1593
1594         if (cs->partition_root_state) {
1595                 /* Cpumask of a partition root cannot be empty */
1596                 if (cpumask_empty(trialcs->cpus_allowed))
1597                         return -EINVAL;
1598                 if (update_parent_subparts_cpumask(cs, partcmd_update,
1599                                         trialcs->cpus_allowed, &tmp) < 0)
1600                         return -EINVAL;
1601         }
1602
1603         spin_lock_irq(&callback_lock);
1604         cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
1605
1606         /*
1607          * Make sure that subparts_cpus is a subset of cpus_allowed.
1608          */
1609         if (cs->nr_subparts_cpus) {
1610                 cpumask_andnot(cs->subparts_cpus, cs->subparts_cpus,
1611                                cs->cpus_allowed);
1612                 cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
1613         }
1614         spin_unlock_irq(&callback_lock);
1615
1616         update_cpumasks_hier(cs, &tmp);
1617
1618         if (cs->partition_root_state) {
1619                 struct cpuset *parent = parent_cs(cs);
1620
1621                 /*
1622                  * For partition root, update the cpumasks of sibling
1623                  * cpusets if they use parent's effective_cpus.
1624                  */
1625                 if (parent->child_ecpus_count)
1626                         update_sibling_cpumasks(parent, cs, &tmp);
1627         }
1628         return 0;
1629 }
1630
1631 /*
1632  * Migrate memory region from one set of nodes to another.  This is
1633  * performed asynchronously as it can be called from process migration path
1634  * holding locks involved in process management.  All mm migrations are
1635  * performed in the queued order and can be waited for by flushing
1636  * cpuset_migrate_mm_wq.
1637  */
1638
1639 struct cpuset_migrate_mm_work {
1640         struct work_struct      work;
1641         struct mm_struct        *mm;
1642         nodemask_t              from;
1643         nodemask_t              to;
1644 };
1645
1646 static void cpuset_migrate_mm_workfn(struct work_struct *work)
1647 {
1648         struct cpuset_migrate_mm_work *mwork =
1649                 container_of(work, struct cpuset_migrate_mm_work, work);
1650
1651         /* on a wq worker, no need to worry about %current's mems_allowed */
1652         do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
1653         mmput(mwork->mm);
1654         kfree(mwork);
1655 }
1656
1657 static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
1658                                                         const nodemask_t *to)
1659 {
1660         struct cpuset_migrate_mm_work *mwork;
1661
1662         if (nodes_equal(*from, *to)) {
1663                 mmput(mm);
1664                 return;
1665         }
1666
1667         mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
1668         if (mwork) {
1669                 mwork->mm = mm;
1670                 mwork->from = *from;
1671                 mwork->to = *to;
1672                 INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
1673                 queue_work(cpuset_migrate_mm_wq, &mwork->work);
1674         } else {
1675                 mmput(mm);
1676         }
1677 }
1678
/*
 * Wait for every mm migration queued on cpuset_migrate_mm_wq to complete
 * by flushing that workqueue.
 */
static void cpuset_post_attach(void)
{
        flush_workqueue(cpuset_migrate_mm_wq);
}
1683
/*
 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
 * @tsk: the task to change
 * @newmems: new nodes that the task will be set
 *
 * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed
 * and rebind an eventual tasks' mempolicy. If the task is allocating in
 * parallel, it might temporarily see an empty intersection, which results in
 * a seqlock check and retry before OOM or allocation failure.
 *
 * The whole update runs with task_lock(@tsk) held and, for the seqcount
 * write section, with local interrupts disabled.
 */
static void cpuset_change_task_nodemask(struct task_struct *tsk,
                                        nodemask_t *newmems)
{
        task_lock(tsk);

        local_irq_disable();
        write_seqcount_begin(&tsk->mems_allowed_seq);

        /*
         * Three steps, in this order: widen mems_allowed to the union of
         * the old and new masks, rebind the task's mempolicy to @newmems,
         * then install @newmems as the final mask.
         */
        nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
        mpol_rebind_task(tsk, newmems);
        tsk->mems_allowed = *newmems;

        write_seqcount_end(&tsk->mems_allowed_seq);
        local_irq_enable();

        task_unlock(tsk);
}
1711
1712 static void *cpuset_being_rebound;
1713
1714 /**
1715  * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
1716  * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
1717  *
1718  * Iterate through each task of @cs updating its mems_allowed to the
1719  * effective cpuset's.  As this function is called with cpuset_rwsem held,
1720  * cpuset membership stays stable.
1721  */
1722 static void update_tasks_nodemask(struct cpuset *cs)
1723 {
1724         static nodemask_t newmems;      /* protected by cpuset_rwsem */
1725         struct css_task_iter it;
1726         struct task_struct *task;
1727
1728         cpuset_being_rebound = cs;              /* causes mpol_dup() rebind */
1729
1730         guarantee_online_mems(cs, &newmems);
1731
1732         /*
1733          * The mpol_rebind_mm() call takes mmap_lock, which we couldn't
1734          * take while holding tasklist_lock.  Forks can happen - the
1735          * mpol_dup() cpuset_being_rebound check will catch such forks,
1736          * and rebind their vma mempolicies too.  Because we still hold
1737          * the global cpuset_rwsem, we know that no other rebind effort
1738          * will be contending for the global variable cpuset_being_rebound.
1739          * It's ok if we rebind the same mm twice; mpol_rebind_mm()
1740          * is idempotent.  Also migrate pages in each mm to new nodes.
1741          */
1742         css_task_iter_start(&cs->css, 0, &it);
1743         while ((task = css_task_iter_next(&it))) {
1744                 struct mm_struct *mm;
1745                 bool migrate;
1746
1747                 cpuset_change_task_nodemask(task, &newmems);
1748
1749                 mm = get_task_mm(task);
1750                 if (!mm)
1751                         continue;
1752
1753                 migrate = is_memory_migrate(cs);
1754
1755                 mpol_rebind_mm(mm, &cs->mems_allowed);
1756                 if (migrate)
1757                         cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
1758                 else
1759                         mmput(mm);
1760         }
1761         css_task_iter_end(&it);
1762
1763         /*
1764          * All the tasks' nodemasks have been updated, update
1765          * cs->old_mems_allowed.
1766          */
1767         cs->old_mems_allowed = newmems;
1768
1769         /* We're done rebinding vmas to this cpuset's new mems_allowed. */
1770         cpuset_being_rebound = NULL;
1771 }
1772
1773 /*
1774  * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
1775  * @cs: the cpuset to consider
1776  * @new_mems: a temp variable for calculating new effective_mems
1777  *
1778  * When configured nodemask is changed, the effective nodemasks of this cpuset
1779  * and all its descendants need to be updated.
1780  *
1781  * On legacy hierarchy, effective_mems will be the same with mems_allowed.
1782  *
1783  * Called with cpuset_rwsem held
1784  */
1785 static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
1786 {
1787         struct cpuset *cp;
1788         struct cgroup_subsys_state *pos_css;
1789
1790         rcu_read_lock();
1791         cpuset_for_each_descendant_pre(cp, pos_css, cs) {
1792                 struct cpuset *parent = parent_cs(cp);
1793
1794                 nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
1795
1796                 /*
1797                  * If it becomes empty, inherit the effective mask of the
1798                  * parent, which is guaranteed to have some MEMs.
1799                  */
1800                 if (is_in_v2_mode() && nodes_empty(*new_mems))
1801                         *new_mems = parent->effective_mems;
1802
1803                 /* Skip the whole subtree if the nodemask remains the same. */
1804                 if (nodes_equal(*new_mems, cp->effective_mems)) {
1805                         pos_css = css_rightmost_descendant(pos_css);
1806                         continue;
1807                 }
1808
1809                 if (!css_tryget_online(&cp->css))
1810                         continue;
1811                 rcu_read_unlock();
1812
1813                 spin_lock_irq(&callback_lock);
1814                 cp->effective_mems = *new_mems;
1815                 spin_unlock_irq(&callback_lock);
1816
1817                 WARN_ON(!is_in_v2_mode() &&
1818                         !nodes_equal(cp->mems_allowed, cp->effective_mems));
1819
1820                 update_tasks_nodemask(cp);
1821
1822                 rcu_read_lock();
1823                 css_put(&cp->css);
1824         }
1825         rcu_read_unlock();
1826 }
1827
1828 /*
1829  * Handle user request to change the 'mems' memory placement
1830  * of a cpuset.  Needs to validate the request, update the
1831  * cpusets mems_allowed, and for each task in the cpuset,
1832  * update mems_allowed and rebind task's mempolicy and any vma
1833  * mempolicies and if the cpuset is marked 'memory_migrate',
1834  * migrate the tasks pages to the new memory.
1835  *
1836  * Call with cpuset_rwsem held. May take callback_lock during call.
1837  * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
1838  * lock each such tasks mm->mmap_lock, scan its vma's and rebind
1839  * their mempolicies to the cpusets new mems_allowed.
1840  */
1841 static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1842                            const char *buf)
1843 {
1844         int retval;
1845
1846         /*
1847          * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
1848          * it's read-only
1849          */
1850         if (cs == &top_cpuset) {
1851                 retval = -EACCES;
1852                 goto done;
1853         }
1854
1855         /*
1856          * An empty mems_allowed is ok iff there are no tasks in the cpuset.
1857          * Since nodelist_parse() fails on an empty mask, we special case
1858          * that parsing.  The validate_change() call ensures that cpusets
1859          * with tasks have memory.
1860          */
1861         if (!*buf) {
1862                 nodes_clear(trialcs->mems_allowed);
1863         } else {
1864                 retval = nodelist_parse(buf, trialcs->mems_allowed);
1865                 if (retval < 0)
1866                         goto done;
1867
1868                 if (!nodes_subset(trialcs->mems_allowed,
1869                                   top_cpuset.mems_allowed)) {
1870                         retval = -EINVAL;
1871                         goto done;
1872                 }
1873         }
1874
1875         if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
1876                 retval = 0;             /* Too easy - nothing to do */
1877                 goto done;
1878         }
1879         retval = validate_change(cs, trialcs);
1880         if (retval < 0)
1881                 goto done;
1882
1883         check_insane_mems_config(&trialcs->mems_allowed);
1884
1885         spin_lock_irq(&callback_lock);
1886         cs->mems_allowed = trialcs->mems_allowed;
1887         spin_unlock_irq(&callback_lock);
1888
1889         /* use trialcs->mems_allowed as a temp variable */
1890         update_nodemasks_hier(cs, &trialcs->mems_allowed);
1891 done:
1892         return retval;
1893 }
1894
1895 bool current_cpuset_is_being_rebound(void)
1896 {
1897         bool ret;
1898
1899         rcu_read_lock();
1900         ret = task_cs(current) == cpuset_being_rebound;
1901         rcu_read_unlock();
1902
1903         return ret;
1904 }
1905
1906 static int update_relax_domain_level(struct cpuset *cs, s64 val)
1907 {
1908 #ifdef CONFIG_SMP
1909         if (val < -1 || val >= sched_domain_level_max)
1910                 return -EINVAL;
1911 #endif
1912
1913         if (val != cs->relax_domain_level) {
1914                 cs->relax_domain_level = val;
1915                 if (!cpumask_empty(cs->cpus_allowed) &&
1916                     is_sched_load_balance(cs))
1917                         rebuild_sched_domains_locked();
1918         }
1919
1920         return 0;
1921 }
1922
1923 /**
1924  * update_tasks_flags - update the spread flags of tasks in the cpuset.
1925  * @cs: the cpuset in which each task's spread flags needs to be changed
1926  *
1927  * Iterate through each task of @cs updating its spread flags.  As this
1928  * function is called with cpuset_rwsem held, cpuset membership stays
1929  * stable.
1930  */
1931 static void update_tasks_flags(struct cpuset *cs)
1932 {
1933         struct css_task_iter it;
1934         struct task_struct *task;
1935
1936         css_task_iter_start(&cs->css, 0, &it);
1937         while ((task = css_task_iter_next(&it)))
1938                 cpuset_update_task_spread_flag(cs, task);
1939         css_task_iter_end(&it);
1940 }
1941
1942 /*
1943  * update_flag - read a 0 or a 1 in a file and update associated flag
1944  * bit:         the bit to update (see cpuset_flagbits_t)
1945  * cs:          the cpuset to update
1946  * turning_on:  whether the flag is being set or cleared
1947  *
1948  * Call with cpuset_rwsem held.
1949  */
1950
1951 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1952                        int turning_on)
1953 {
1954         struct cpuset *trialcs;
1955         int balance_flag_changed;
1956         int spread_flag_changed;
1957         int err;
1958
1959         trialcs = alloc_trial_cpuset(cs);
1960         if (!trialcs)
1961                 return -ENOMEM;
1962
1963         if (turning_on)
1964                 set_bit(bit, &trialcs->flags);
1965         else
1966                 clear_bit(bit, &trialcs->flags);
1967
1968         err = validate_change(cs, trialcs);
1969         if (err < 0)
1970                 goto out;
1971
1972         balance_flag_changed = (is_sched_load_balance(cs) !=
1973                                 is_sched_load_balance(trialcs));
1974
1975         spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
1976                         || (is_spread_page(cs) != is_spread_page(trialcs)));
1977
1978         spin_lock_irq(&callback_lock);
1979         cs->flags = trialcs->flags;
1980         spin_unlock_irq(&callback_lock);
1981
1982         if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1983                 rebuild_sched_domains_locked();
1984
1985         if (spread_flag_changed)
1986                 update_tasks_flags(cs);
1987 out:
1988         free_cpuset(trialcs);
1989         return err;
1990 }
1991
/*
 * update_prstate - update partition_root_state
 * cs: the cpuset to update
 * new_prs: new partition root state
 *
 * Returns 0 when the state was changed or already equal to @new_prs,
 * -EINVAL when the transition is not allowed, -ENOMEM when the temporary
 * cpumasks cannot be allocated, or the error returned by update_flag() /
 * update_parent_subparts_cpumask().
 *
 * Call with cpuset_rwsem held.
 */
static int update_prstate(struct cpuset *cs, int new_prs)
{
        int err, old_prs = cs->partition_root_state;
        struct cpuset *parent = parent_cs(cs);
        struct tmpmasks tmpmask;

        if (old_prs == new_prs)
                return 0;

        /*
         * Cannot force a partial or invalid partition root to a full
         * partition root.
         */
        if (new_prs && (old_prs == PRS_ERROR))
                return -EINVAL;

        if (alloc_cpumasks(NULL, &tmpmask))
                return -ENOMEM;

        /* Default result for the early "goto out" below. */
        err = -EINVAL;
        if (!old_prs) {
                /*
                 * Turning on partition root requires setting the
                 * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
                 * cannot be empty.
                 */
                if (cpumask_empty(cs->cpus_allowed))
                        goto out;

                err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
                if (err)
                        goto out;

                err = update_parent_subparts_cpumask(cs, partcmd_enable,
                                                     NULL, &tmpmask);
                if (err) {
                        /* Roll back the exclusive bit set just above. */
                        update_flag(CS_CPU_EXCLUSIVE, cs, 0);
                        goto out;
                }
        } else {
                /*
                 * Turning off partition root will clear the
                 * CS_CPU_EXCLUSIVE bit.
                 */
                if (old_prs == PRS_ERROR) {
                        /*
                         * Only the flag is cleared here; the parent and
                         * sibling updates below are skipped, but err == 0
                         * still lets the "out" path record the new state.
                         */
                        update_flag(CS_CPU_EXCLUSIVE, cs, 0);
                        err = 0;
                        goto out;
                }

                err = update_parent_subparts_cpumask(cs, partcmd_disable,
                                                     NULL, &tmpmask);
                if (err)
                        goto out;

                /* Turning off CS_CPU_EXCLUSIVE will not return error */
                update_flag(CS_CPU_EXCLUSIVE, cs, 0);
        }

        /*
         * Update cpumask of parent's tasks except when it is the top
         * cpuset as some system daemons cannot be mapped to other CPUs.
         */
        if (parent != &top_cpuset)
                update_tasks_cpumask(parent);

        if (parent->child_ecpus_count)
                update_sibling_cpumasks(parent, cs, &tmpmask);

        rebuild_sched_domains_locked();
out:
        /* Commit and announce the new state only on success. */
        if (!err) {
                spin_lock_irq(&callback_lock);
                cs->partition_root_state = new_prs;
                spin_unlock_irq(&callback_lock);
                notify_partition_change(cs, old_prs, new_prs);
        }

        free_cpumasks(NULL, &tmpmask);
        return err;
}
2080
2081 /*
2082  * Frequency meter - How fast is some event occurring?
2083  *
2084  * These routines manage a digitally filtered, constant time based,
2085  * event frequency meter.  There are four routines:
2086  *   fmeter_init() - initialize a frequency meter.
2087  *   fmeter_markevent() - called each time the event happens.
2088  *   fmeter_getrate() - returns the recent rate of such events.
2089  *   fmeter_update() - internal routine used to update fmeter.
2090  *
2091  * A common data structure is passed to each of these routines,
2092  * which is used to keep track of the state required to manage the
2093  * frequency meter and its digital filter.
2094  *
2095  * The filter works on the number of events marked per unit time.
2096  * The filter is single-pole low-pass recursive (IIR).  The time unit
2097  * is 1 second.  Arithmetic is done using 32-bit integers scaled to
2098  * simulate 3 decimal digits of precision (multiplied by 1000).
2099  *
2100  * With an FM_COEF of 933, and a time base of 1 second, the filter
2101  * has a half-life of 10 seconds, meaning that if the events quit
2102  * happening, then the rate returned from the fmeter_getrate()
2103  * will be cut in half each 10 seconds, until it converges to zero.
2104  *
2105  * It is not worth doing a real infinitely recursive filter.  If more
2106  * than FM_MAXTICKS ticks have elapsed since the last filter event,
2107  * just compute FM_MAXTICKS ticks worth, by which point the level
2108  * will be stable.
2109  *
2110  * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
2111  * arithmetic overflow in the fmeter_update() routine.
2112  *
2113  * Given the simple 32 bit integer arithmetic used, this meter works
2114  * best for reporting rates between one per millisecond (msec) and
2115  * one per 32 (approx) seconds.  At constant rates faster than one
2116  * per msec it maxes out at values just under 1,000,000.  At constant
2117  * rates between one per msec, and one per second it will stabilize
2118  * to a value N*1000, where N is the rate of events per second.
2119  * At constant rates between one per second and one per 32 seconds,
2120  * it will be choppy, moving up on the seconds that have an event,
2121  * and then decaying until the next event.  At rates slower than
2122  * about one in 32 seconds, it decays all the way back to zero between
2123  * each event.
2124  */
2125
/* Tuning constants for the frequency meter's fixed-point filter. */
#define FM_COEF 933             /* coefficient for half-life of 10 secs (scaled by FM_SCALE) */
#define FM_MAXTICKS ((u32)99)   /* useless computing more ticks than this */
#define FM_MAXCNT 1000000       /* limit cnt to avoid overflow */
#define FM_SCALE 1000           /* faux fixed point scale */
2130
2131 /* Initialize a frequency meter */
2132 static void fmeter_init(struct fmeter *fmp)
2133 {
2134         fmp->cnt = 0;
2135         fmp->val = 0;
2136         fmp->time = 0;
2137         spin_lock_init(&fmp->lock);
2138 }
2139
2140 /* Internal meter update - process cnt events and update value */
2141 static void fmeter_update(struct fmeter *fmp)
2142 {
2143         time64_t now;
2144         u32 ticks;
2145
2146         now = ktime_get_seconds();
2147         ticks = now - fmp->time;
2148
2149         if (ticks == 0)
2150                 return;
2151
2152         ticks = min(FM_MAXTICKS, ticks);
2153         while (ticks-- > 0)
2154                 fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
2155         fmp->time = now;
2156
2157         fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
2158         fmp->cnt = 0;
2159 }
2160
2161 /* Process any previous ticks, then bump cnt by one (times scale). */
2162 static void fmeter_markevent(struct fmeter *fmp)
2163 {
2164         spin_lock(&fmp->lock);
2165         fmeter_update(fmp);
2166         fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
2167         spin_unlock(&fmp->lock);
2168 }
2169
2170 /* Process any previous ticks, then return current value. */
2171 static int fmeter_getrate(struct fmeter *fmp)
2172 {
2173         int val;
2174
2175         spin_lock(&fmp->lock);
2176         fmeter_update(fmp);
2177         val = fmp->val;
2178         spin_unlock(&fmp->lock);
2179         return val;
2180 }
2181
2182 static struct cpuset *cpuset_attach_old_cs;
2183
2184 /* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */
2185 static int cpuset_can_attach(struct cgroup_taskset *tset)
2186 {
2187         struct cgroup_subsys_state *css;
2188         struct cpuset *cs;
2189         struct task_struct *task;
2190         int ret;
2191
2192         /* used later by cpuset_attach() */
2193         cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
2194         cs = css_cs(css);
2195
2196         percpu_down_write(&cpuset_rwsem);
2197
2198         /* allow moving tasks into an empty cpuset if on default hierarchy */
2199         ret = -ENOSPC;
2200         if (!is_in_v2_mode() &&
2201             (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
2202                 goto out_unlock;
2203
2204         cgroup_taskset_for_each(task, css, tset) {
2205                 ret = task_can_attach(task, cs->cpus_allowed);
2206                 if (ret)
2207                         goto out_unlock;
2208                 ret = security_task_setscheduler(task);
2209                 if (ret)
2210                         goto out_unlock;
2211         }
2212
2213         /*
2214          * Mark attach is in progress.  This makes validate_change() fail
2215          * changes which zero cpus/mems_allowed.
2216          */
2217         cs->attach_in_progress++;
2218         ret = 0;
2219 out_unlock:
2220         percpu_up_write(&cpuset_rwsem);
2221         return ret;
2222 }
2223
2224 static void cpuset_cancel_attach(struct cgroup_taskset *tset)
2225 {
2226         struct cgroup_subsys_state *css;
2227
2228         cgroup_taskset_first(tset, &css);
2229
2230         percpu_down_write(&cpuset_rwsem);
2231         css_cs(css)->attach_in_progress--;
2232         percpu_up_write(&cpuset_rwsem);
2233 }
2234
2235 /*
2236  * Protected by cpuset_rwsem.  cpus_attach is used only by cpuset_attach()
2237  * but we can't allocate it dynamically there.  Define it global and
2238  * allocate from cpuset_init().
2239  */
2240 static cpumask_var_t cpus_attach;
2241
2242 static void cpuset_attach(struct cgroup_taskset *tset)
2243 {
2244         /* static buf protected by cpuset_rwsem */
2245         static nodemask_t cpuset_attach_nodemask_to;
2246         struct task_struct *task;
2247         struct task_struct *leader;
2248         struct cgroup_subsys_state *css;
2249         struct cpuset *cs;
2250         struct cpuset *oldcs = cpuset_attach_old_cs;
2251
2252         cgroup_taskset_first(tset, &css);
2253         cs = css_cs(css);
2254
2255         percpu_down_write(&cpuset_rwsem);
2256
2257         guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
2258
2259         cgroup_taskset_for_each(task, css, tset) {
2260                 if (cs != &top_cpuset)
2261                         guarantee_online_cpus(task, cpus_attach);
2262                 else
2263                         cpumask_copy(cpus_attach, task_cpu_possible_mask(task));
2264                 /*
2265                  * can_attach beforehand should guarantee that this doesn't
2266                  * fail.  TODO: have a better way to handle failure here
2267                  */
2268                 WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
2269
2270                 cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
2271                 cpuset_update_task_spread_flag(cs, task);
2272         }
2273
2274         /*
2275          * Change mm for all threadgroup leaders. This is expensive and may
2276          * sleep and should be moved outside migration path proper.
2277          */
2278         cpuset_attach_nodemask_to = cs->effective_mems;
2279         cgroup_taskset_for_each_leader(leader, css, tset) {
2280                 struct mm_struct *mm = get_task_mm(leader);
2281
2282                 if (mm) {
2283                         mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
2284
2285                         /*
2286                          * old_mems_allowed is the same with mems_allowed
2287                          * here, except if this task is being moved
2288                          * automatically due to hotplug.  In that case
2289                          * @mems_allowed has been updated and is empty, so
2290                          * @old_mems_allowed is the right nodesets that we
2291                          * migrate mm from.
2292                          */
2293                         if (is_memory_migrate(cs))
2294                                 cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
2295                                                   &cpuset_attach_nodemask_to);
2296                         else
2297                                 mmput(mm);
2298                 }
2299         }
2300
2301         cs->old_mems_allowed = cpuset_attach_nodemask_to;
2302
2303         cs->attach_in_progress--;
2304         if (!cs->attach_in_progress)
2305                 wake_up(&cpuset_attach_wq);
2306
2307         percpu_up_write(&cpuset_rwsem);
2308 }
2309
/*
 * The various types of files and directories in a cpuset file system.
 *
 * The write handlers read one of these values back from cft->private
 * and switch on it to decide which setting a write applies to.
 */

typedef enum {
        FILE_MEMORY_MIGRATE,
        FILE_CPULIST,
        FILE_MEMLIST,
        FILE_EFFECTIVE_CPULIST,
        FILE_EFFECTIVE_MEMLIST,
        FILE_SUBPARTS_CPULIST,
        FILE_CPU_EXCLUSIVE,
        FILE_MEM_EXCLUSIVE,
        FILE_MEM_HARDWALL,
        FILE_SCHED_LOAD_BALANCE,
        FILE_PARTITION_ROOT,
        FILE_SCHED_RELAX_DOMAIN_LEVEL,
        FILE_MEMORY_PRESSURE_ENABLED,
        FILE_MEMORY_PRESSURE,
        FILE_SPREAD_PAGE,
        FILE_SPREAD_SLAB,
} cpuset_filetype_t;
2330
2331 static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
2332                             u64 val)
2333 {
2334         struct cpuset *cs = css_cs(css);
2335         cpuset_filetype_t type = cft->private;
2336         int retval = 0;
2337
2338         cpus_read_lock();
2339         percpu_down_write(&cpuset_rwsem);
2340         if (!is_cpuset_online(cs)) {
2341                 retval = -ENODEV;
2342                 goto out_unlock;
2343         }
2344
2345         switch (type) {
2346         case FILE_CPU_EXCLUSIVE:
2347                 retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
2348                 break;
2349         case FILE_MEM_EXCLUSIVE:
2350                 retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
2351                 break;
2352         case FILE_MEM_HARDWALL:
2353                 retval = update_flag(CS_MEM_HARDWALL, cs, val);
2354                 break;
2355         case FILE_SCHED_LOAD_BALANCE:
2356                 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
2357                 break;
2358         case FILE_MEMORY_MIGRATE:
2359                 retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
2360                 break;
2361         case FILE_MEMORY_PRESSURE_ENABLED:
2362                 cpuset_memory_pressure_enabled = !!val;
2363                 break;
2364         case FILE_SPREAD_PAGE:
2365                 retval = update_flag(CS_SPREAD_PAGE, cs, val);
2366                 break;
2367         case FILE_SPREAD_SLAB:
2368                 retval = update_flag(CS_SPREAD_SLAB, cs, val);
2369                 break;
2370         default:
2371                 retval = -EINVAL;
2372                 break;
2373         }
2374 out_unlock:
2375         percpu_up_write(&cpuset_rwsem);
2376         cpus_read_unlock();
2377         return retval;
2378 }
2379
2380 static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
2381                             s64 val)
2382 {
2383         struct cpuset *cs = css_cs(css);
2384         cpuset_filetype_t type = cft->private;
2385         int retval = -ENODEV;
2386
2387         cpus_read_lock();
2388         percpu_down_write(&cpuset_rwsem);
2389         if (!is_cpuset_online(cs))
2390                 goto out_unlock;
2391
2392         switch (type) {
2393         case FILE_SCHED_RELAX_DOMAIN_LEVEL:
2394                 retval = update_relax_domain_level(cs, val);
2395                 break;
2396         default:
2397                 retval = -EINVAL;
2398                 break;
2399         }
2400 out_unlock:
2401         percpu_up_write(&cpuset_rwsem);
2402         cpus_read_unlock();
2403         return retval;
2404 }
2405
2406 /*
2407  * Common handling for a write to a "cpus" or "mems" file.
2408  */
2409 static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
2410                                     char *buf, size_t nbytes, loff_t off)
2411 {
2412         struct cpuset *cs = css_cs(of_css(of));
2413         struct cpuset *trialcs;
2414         int retval = -ENODEV;
2415
2416         buf = strstrip(buf);
2417
2418         /*
2419          * CPU or memory hotunplug may leave @cs w/o any execution
2420          * resources, in which case the hotplug code asynchronously updates
2421          * configuration and transfers all tasks to the nearest ancestor
2422          * which can execute.
2423          *
2424          * As writes to "cpus" or "mems" may restore @cs's execution
2425          * resources, wait for the previously scheduled operations before
2426          * proceeding, so that we don't end up keep removing tasks added
2427          * after execution capability is restored.
2428          *
2429          * cpuset_hotplug_work calls back into cgroup core via
2430          * cgroup_transfer_tasks() and waiting for it from a cgroupfs
2431          * operation like this one can lead to a deadlock through kernfs
2432          * active_ref protection.  Let's break the protection.  Losing the
2433          * protection is okay as we check whether @cs is online after
2434          * grabbing cpuset_rwsem anyway.  This only happens on the legacy
2435          * hierarchies.
2436          */
2437         css_get(&cs->css);
2438         kernfs_break_active_protection(of->kn);
2439         flush_work(&cpuset_hotplug_work);
2440
2441         cpus_read_lock();
2442         percpu_down_write(&cpuset_rwsem);
2443         if (!is_cpuset_online(cs))
2444                 goto out_unlock;
2445
2446         trialcs = alloc_trial_cpuset(cs);
2447         if (!trialcs) {
2448                 retval = -ENOMEM;
2449                 goto out_unlock;
2450         }
2451
2452         switch (of_cft(of)->private) {
2453         case FILE_CPULIST:
2454                 retval = update_cpumask(cs, trialcs, buf);
2455                 break;
2456         case FILE_MEMLIST:
2457                 retval = update_nodemask(cs, trialcs, buf);
2458                 break;
2459         default:
2460                 retval = -EINVAL;
2461                 break;
2462         }
2463
2464         free_cpuset(trialcs);
2465 out_unlock:
2466         percpu_up_write(&cpuset_rwsem);
2467         cpus_read_unlock();
2468         kernfs_unbreak_active_protection(of->kn);
2469         css_put(&cs->css);
2470         flush_workqueue(cpuset_migrate_mm_wq);
2471         return retval ?: nbytes;
2472 }
2473
2474 /*
2475  * These ascii lists should be read in a single call, by using a user
2476  * buffer large enough to hold the entire map.  If read in smaller
2477  * chunks, there is no guarantee of atomicity.  Since the display format
2478  * used, list of ranges of sequential numbers, is variable length,
2479  * and since these maps can change value dynamically, one could read
2480  * gibberish by doing partial reads while a list was changing.
2481  */
2482 static int cpuset_common_seq_show(struct seq_file *sf, void *v)
2483 {
2484         struct cpuset *cs = css_cs(seq_css(sf));
2485         cpuset_filetype_t type = seq_cft(sf)->private;
2486         int ret = 0;
2487
2488         spin_lock_irq(&callback_lock);
2489
2490         switch (type) {
2491         case FILE_CPULIST:
2492                 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
2493                 break;
2494         case FILE_MEMLIST:
2495                 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
2496                 break;
2497         case FILE_EFFECTIVE_CPULIST:
2498                 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
2499                 break;
2500         case FILE_EFFECTIVE_MEMLIST:
2501                 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
2502                 break;
2503         case FILE_SUBPARTS_CPULIST:
2504                 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus));
2505                 break;
2506         default:
2507                 ret = -EINVAL;
2508         }
2509
2510         spin_unlock_irq(&callback_lock);
2511         return ret;
2512 }
2513
2514 static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
2515 {
2516         struct cpuset *cs = css_cs(css);
2517         cpuset_filetype_t type = cft->private;
2518         switch (type) {
2519         case FILE_CPU_EXCLUSIVE:
2520                 return is_cpu_exclusive(cs);
2521         case FILE_MEM_EXCLUSIVE:
2522                 return is_mem_exclusive(cs);
2523         case FILE_MEM_HARDWALL:
2524                 return is_mem_hardwall(cs);
2525         case FILE_SCHED_LOAD_BALANCE:
2526                 return is_sched_load_balance(cs);
2527         case FILE_MEMORY_MIGRATE:
2528                 return is_memory_migrate(cs);
2529         case FILE_MEMORY_PRESSURE_ENABLED:
2530                 return cpuset_memory_pressure_enabled;
2531         case FILE_MEMORY_PRESSURE:
2532                 return fmeter_getrate(&cs->fmeter);
2533         case FILE_SPREAD_PAGE:
2534                 return is_spread_page(cs);
2535         case FILE_SPREAD_SLAB:
2536                 return is_spread_slab(cs);
2537         default:
2538                 BUG();
2539         }
2540
2541         /* Unreachable but makes gcc happy */
2542         return 0;
2543 }
2544
2545 static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
2546 {
2547         struct cpuset *cs = css_cs(css);
2548         cpuset_filetype_t type = cft->private;
2549         switch (type) {
2550         case FILE_SCHED_RELAX_DOMAIN_LEVEL:
2551                 return cs->relax_domain_level;
2552         default:
2553                 BUG();
2554         }
2555
2556         /* Unreachable but makes gcc happy */
2557         return 0;
2558 }
2559
2560 static int sched_partition_show(struct seq_file *seq, void *v)
2561 {
2562         struct cpuset *cs = css_cs(seq_css(seq));
2563
2564         switch (cs->partition_root_state) {
2565         case PRS_ENABLED:
2566                 seq_puts(seq, "root\n");
2567                 break;
2568         case PRS_DISABLED:
2569                 seq_puts(seq, "member\n");
2570                 break;
2571         case PRS_ERROR:
2572                 seq_puts(seq, "root invalid\n");
2573                 break;
2574         }
2575         return 0;
2576 }
2577
2578 static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
2579                                      size_t nbytes, loff_t off)
2580 {
2581         struct cpuset *cs = css_cs(of_css(of));
2582         int val;
2583         int retval = -ENODEV;
2584
2585         buf = strstrip(buf);
2586
2587         /*
2588          * Convert "root" to ENABLED, and convert "member" to DISABLED.
2589          */
2590         if (!strcmp(buf, "root"))
2591                 val = PRS_ENABLED;
2592         else if (!strcmp(buf, "member"))
2593                 val = PRS_DISABLED;
2594         else
2595                 return -EINVAL;
2596
2597         css_get(&cs->css);
2598         cpus_read_lock();
2599         percpu_down_write(&cpuset_rwsem);
2600         if (!is_cpuset_online(cs))
2601                 goto out_unlock;
2602
2603         retval = update_prstate(cs, val);
2604 out_unlock:
2605         percpu_up_write(&cpuset_rwsem);
2606         cpus_read_unlock();
2607         css_put(&cs->css);
2608         return retval ?: nbytes;
2609 }
2610
2611 /*
2612  * for the common functions, 'private' gives the type of file
2613  */
2614
2615 static struct cftype legacy_files[] = {
2616         {
2617                 .name = "cpus",
2618                 .seq_show = cpuset_common_seq_show,
2619                 .write = cpuset_write_resmask,
2620                 .max_write_len = (100U + 6 * NR_CPUS),
2621                 .private = FILE_CPULIST,
2622         },
2623
2624         {
2625                 .name = "mems",
2626                 .seq_show = cpuset_common_seq_show,
2627                 .write = cpuset_write_resmask,
2628                 .max_write_len = (100U + 6 * MAX_NUMNODES),
2629                 .private = FILE_MEMLIST,
2630         },
2631
2632         {
2633                 .name = "effective_cpus",
2634                 .seq_show = cpuset_common_seq_show,
2635                 .private = FILE_EFFECTIVE_CPULIST,
2636         },
2637
2638         {
2639                 .name = "effective_mems",
2640                 .seq_show = cpuset_common_seq_show,
2641                 .private = FILE_EFFECTIVE_MEMLIST,
2642         },
2643
2644         {
2645                 .name = "cpu_exclusive",
2646                 .read_u64 = cpuset_read_u64,
2647                 .write_u64 = cpuset_write_u64,
2648                 .private = FILE_CPU_EXCLUSIVE,
2649         },
2650
2651         {
2652                 .name = "mem_exclusive",
2653                 .read_u64 = cpuset_read_u64,
2654                 .write_u64 = cpuset_write_u64,
2655                 .private = FILE_MEM_EXCLUSIVE,
2656         },
2657
2658         {
2659                 .name = "mem_hardwall",
2660                 .read_u64 = cpuset_read_u64,
2661                 .write_u64 = cpuset_write_u64,
2662                 .private = FILE_MEM_HARDWALL,
2663         },
2664
2665         {
2666                 .name = "sched_load_balance",
2667                 .read_u64 = cpuset_read_u64,
2668                 .write_u64 = cpuset_write_u64,
2669                 .private = FILE_SCHED_LOAD_BALANCE,
2670         },
2671
2672         {
2673                 .name = "sched_relax_domain_level",
2674                 .read_s64 = cpuset_read_s64,
2675                 .write_s64 = cpuset_write_s64,
2676                 .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
2677         },
2678
2679         {
2680                 .name = "memory_migrate",
2681                 .read_u64 = cpuset_read_u64,
2682                 .write_u64 = cpuset_write_u64,
2683                 .private = FILE_MEMORY_MIGRATE,
2684         },
2685
2686         {
2687                 .name = "memory_pressure",
2688                 .read_u64 = cpuset_read_u64,
2689                 .private = FILE_MEMORY_PRESSURE,
2690         },
2691
2692         {
2693                 .name = "memory_spread_page",
2694                 .read_u64 = cpuset_read_u64,
2695                 .write_u64 = cpuset_write_u64,
2696                 .private = FILE_SPREAD_PAGE,
2697         },
2698
2699         {
2700                 .name = "memory_spread_slab",
2701                 .read_u64 = cpuset_read_u64,
2702                 .write_u64 = cpuset_write_u64,
2703                 .private = FILE_SPREAD_SLAB,
2704         },
2705
2706         {
2707                 .name = "memory_pressure_enabled",
2708                 .flags = CFTYPE_ONLY_ON_ROOT,
2709                 .read_u64 = cpuset_read_u64,
2710                 .write_u64 = cpuset_write_u64,
2711                 .private = FILE_MEMORY_PRESSURE_ENABLED,
2712         },
2713
2714         { }     /* terminate */
2715 };
2716
2717 /*
2718  * This is currently a minimal set for the default hierarchy. It can be
2719  * expanded later on by migrating more features and control files from v1.
2720  */
2721 static struct cftype dfl_files[] = {
2722         {
2723                 .name = "cpus",
2724                 .seq_show = cpuset_common_seq_show,
2725                 .write = cpuset_write_resmask,
2726                 .max_write_len = (100U + 6 * NR_CPUS),
2727                 .private = FILE_CPULIST,
2728                 .flags = CFTYPE_NOT_ON_ROOT,
2729         },
2730
2731         {
2732                 .name = "mems",
2733                 .seq_show = cpuset_common_seq_show,
2734                 .write = cpuset_write_resmask,
2735                 .max_write_len = (100U + 6 * MAX_NUMNODES),
2736                 .private = FILE_MEMLIST,
2737                 .flags = CFTYPE_NOT_ON_ROOT,
2738         },
2739
2740         {
2741                 .name = "cpus.effective",
2742                 .seq_show = cpuset_common_seq_show,
2743                 .private = FILE_EFFECTIVE_CPULIST,
2744         },
2745
2746         {
2747                 .name = "mems.effective",
2748                 .seq_show = cpuset_common_seq_show,
2749                 .private = FILE_EFFECTIVE_MEMLIST,
2750         },
2751
2752         {
2753                 .name = "cpus.partition",
2754                 .seq_show = sched_partition_show,
2755                 .write = sched_partition_write,
2756                 .private = FILE_PARTITION_ROOT,
2757                 .flags = CFTYPE_NOT_ON_ROOT,
2758                 .file_offset = offsetof(struct cpuset, partition_file),
2759         },
2760
2761         {
2762                 .name = "cpus.subpartitions",
2763                 .seq_show = cpuset_common_seq_show,
2764                 .private = FILE_SUBPARTS_CPULIST,
2765                 .flags = CFTYPE_DEBUG,
2766         },
2767
2768         { }     /* terminate */
2769 };
2770
2771
2772 /*
2773  *      cpuset_css_alloc - allocate a cpuset css
2774  *      cgrp:   control group that the new cpuset will be part of
2775  */
2776
2777 static struct cgroup_subsys_state *
2778 cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
2779 {
2780         struct cpuset *cs;
2781
2782         if (!parent_css)
2783                 return &top_cpuset.css;
2784
2785         cs = kzalloc(sizeof(*cs), GFP_KERNEL);
2786         if (!cs)
2787                 return ERR_PTR(-ENOMEM);
2788
2789         if (alloc_cpumasks(cs, NULL)) {
2790                 kfree(cs);
2791                 return ERR_PTR(-ENOMEM);
2792         }
2793
2794         __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
2795         nodes_clear(cs->mems_allowed);
2796         nodes_clear(cs->effective_mems);
2797         fmeter_init(&cs->fmeter);
2798         cs->relax_domain_level = -1;
2799
2800         /* Set CS_MEMORY_MIGRATE for default hierarchy */
2801         if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
2802                 __set_bit(CS_MEMORY_MIGRATE, &cs->flags);
2803
2804         return &cs->css;
2805 }
2806
2807 static int cpuset_css_online(struct cgroup_subsys_state *css)
2808 {
2809         struct cpuset *cs = css_cs(css);
2810         struct cpuset *parent = parent_cs(cs);
2811         struct cpuset *tmp_cs;
2812         struct cgroup_subsys_state *pos_css;
2813
2814         if (!parent)
2815                 return 0;
2816
2817         cpus_read_lock();
2818         percpu_down_write(&cpuset_rwsem);
2819
2820         set_bit(CS_ONLINE, &cs->flags);
2821         if (is_spread_page(parent))
2822                 set_bit(CS_SPREAD_PAGE, &cs->flags);
2823         if (is_spread_slab(parent))
2824                 set_bit(CS_SPREAD_SLAB, &cs->flags);
2825
2826         cpuset_inc();
2827
2828         spin_lock_irq(&callback_lock);
2829         if (is_in_v2_mode()) {
2830                 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
2831                 cs->effective_mems = parent->effective_mems;
2832                 cs->use_parent_ecpus = true;
2833                 parent->child_ecpus_count++;
2834         }
2835         spin_unlock_irq(&callback_lock);
2836
2837         if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
2838                 goto out_unlock;
2839
2840         /*
2841          * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
2842          * set.  This flag handling is implemented in cgroup core for
2843          * histrical reasons - the flag may be specified during mount.
2844          *
2845          * Currently, if any sibling cpusets have exclusive cpus or mem, we
2846          * refuse to clone the configuration - thereby refusing the task to
2847          * be entered, and as a result refusing the sys_unshare() or
2848          * clone() which initiated it.  If this becomes a problem for some
2849          * users who wish to allow that scenario, then this could be
2850          * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
2851          * (and likewise for mems) to the new cgroup.
2852          */
2853         rcu_read_lock();
2854         cpuset_for_each_child(tmp_cs, pos_css, parent) {
2855                 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
2856                         rcu_read_unlock();
2857                         goto out_unlock;
2858                 }
2859         }
2860         rcu_read_unlock();
2861
2862         spin_lock_irq(&callback_lock);
2863         cs->mems_allowed = parent->mems_allowed;
2864         cs->effective_mems = parent->mems_allowed;
2865         cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
2866         cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
2867         spin_unlock_irq(&callback_lock);
2868 out_unlock:
2869         percpu_up_write(&cpuset_rwsem);
2870         cpus_read_unlock();
2871         return 0;
2872 }
2873
2874 /*
2875  * If the cpuset being removed has its flag 'sched_load_balance'
2876  * enabled, then simulate turning sched_load_balance off, which
2877  * will call rebuild_sched_domains_locked(). That is not needed
2878  * in the default hierarchy where only changes in partition
2879  * will cause repartitioning.
2880  *
2881  * If the cpuset has the 'sched.partition' flag enabled, simulate
2882  * turning 'sched.partition" off.
2883  */
2884
2885 static void cpuset_css_offline(struct cgroup_subsys_state *css)
2886 {
2887         struct cpuset *cs = css_cs(css);
2888
2889         cpus_read_lock();
2890         percpu_down_write(&cpuset_rwsem);
2891
2892         if (is_partition_root(cs))
2893                 update_prstate(cs, 0);
2894
2895         if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
2896             is_sched_load_balance(cs))
2897                 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
2898
2899         if (cs->use_parent_ecpus) {
2900                 struct cpuset *parent = parent_cs(cs);
2901
2902                 cs->use_parent_ecpus = false;
2903                 parent->child_ecpus_count--;
2904         }
2905
2906         cpuset_dec();
2907         clear_bit(CS_ONLINE, &cs->flags);
2908
2909         percpu_up_write(&cpuset_rwsem);
2910         cpus_read_unlock();
2911 }
2912
/* Release the cpuset that embeds @css. */
static void cpuset_css_free(struct cgroup_subsys_state *css)
{
	free_cpuset(css_cs(css));
}
2919
2920 static void cpuset_bind(struct cgroup_subsys_state *root_css)
2921 {
2922         percpu_down_write(&cpuset_rwsem);
2923         spin_lock_irq(&callback_lock);
2924
2925         if (is_in_v2_mode()) {
2926                 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
2927                 top_cpuset.mems_allowed = node_possible_map;
2928         } else {
2929                 cpumask_copy(top_cpuset.cpus_allowed,
2930                              top_cpuset.effective_cpus);
2931                 top_cpuset.mems_allowed = top_cpuset.effective_mems;
2932         }
2933
2934         spin_unlock_irq(&callback_lock);
2935         percpu_up_write(&cpuset_rwsem);
2936 }
2937
2938 /*
2939  * Make sure the new task conform to the current state of its parent,
2940  * which could have been changed by cpuset just after it inherits the
2941  * state from the parent and before it sits on the cgroup's task list.
2942  */
2943 static void cpuset_fork(struct task_struct *task)
2944 {
2945         if (task_css_is_root(task, cpuset_cgrp_id))
2946                 return;
2947
2948         set_cpus_allowed_ptr(task, current->cpus_ptr);
2949         task->mems_allowed = current->mems_allowed;
2950 }
2951
2952 struct cgroup_subsys cpuset_cgrp_subsys = {
2953         .css_alloc      = cpuset_css_alloc,
2954         .css_online     = cpuset_css_online,
2955         .css_offline    = cpuset_css_offline,
2956         .css_free       = cpuset_css_free,
2957         .can_attach     = cpuset_can_attach,
2958         .cancel_attach  = cpuset_cancel_attach,
2959         .attach         = cpuset_attach,
2960         .post_attach    = cpuset_post_attach,
2961         .bind           = cpuset_bind,
2962         .fork           = cpuset_fork,
2963         .legacy_cftypes = legacy_files,
2964         .dfl_cftypes    = dfl_files,
2965         .early_init     = true,
2966         .threaded       = true,
2967 };
2968
2969 /**
2970  * cpuset_init - initialize cpusets at system boot
2971  *
2972  * Description: Initialize top_cpuset
2973  **/
2974
2975 int __init cpuset_init(void)
2976 {
2977         BUG_ON(percpu_init_rwsem(&cpuset_rwsem));
2978
2979         BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
2980         BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
2981         BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
2982
2983         cpumask_setall(top_cpuset.cpus_allowed);
2984         nodes_setall(top_cpuset.mems_allowed);
2985         cpumask_setall(top_cpuset.effective_cpus);
2986         nodes_setall(top_cpuset.effective_mems);
2987
2988         fmeter_init(&top_cpuset.fmeter);
2989         set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
2990         top_cpuset.relax_domain_level = -1;
2991
2992         BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
2993
2994         return 0;
2995 }
2996
2997 /*
2998  * If CPU and/or memory hotplug handlers, below, unplug any CPUs
2999  * or memory nodes, we need to walk over the cpuset hierarchy,
3000  * removing that CPU or node from all cpusets.  If this removes the
3001  * last CPU or node from a cpuset, then move the tasks in the empty
3002  * cpuset to its next-highest non-empty parent.
3003  */
3004 static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
3005 {
3006         struct cpuset *parent;
3007
3008         /*
3009          * Find its next-highest non-empty parent, (top cpuset
3010          * has online cpus, so can't be empty).
3011          */
3012         parent = parent_cs(cs);
3013         while (cpumask_empty(parent->cpus_allowed) ||
3014                         nodes_empty(parent->mems_allowed))
3015                 parent = parent_cs(parent);
3016
3017         if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
3018                 pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
3019                 pr_cont_cgroup_name(cs->css.cgroup);
3020                 pr_cont("\n");
3021         }
3022 }
3023
3024 static void
3025 hotplug_update_tasks_legacy(struct cpuset *cs,
3026                             struct cpumask *new_cpus, nodemask_t *new_mems,
3027                             bool cpus_updated, bool mems_updated)
3028 {
3029         bool is_empty;
3030
3031         spin_lock_irq(&callback_lock);
3032         cpumask_copy(cs->cpus_allowed, new_cpus);
3033         cpumask_copy(cs->effective_cpus, new_cpus);
3034         cs->mems_allowed = *new_mems;
3035         cs->effective_mems = *new_mems;
3036         spin_unlock_irq(&callback_lock);
3037
3038         /*
3039          * Don't call update_tasks_cpumask() if the cpuset becomes empty,
3040          * as the tasks will be migratecd to an ancestor.
3041          */
3042         if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
3043                 update_tasks_cpumask(cs);
3044         if (mems_updated && !nodes_empty(cs->mems_allowed))
3045                 update_tasks_nodemask(cs);
3046
3047         is_empty = cpumask_empty(cs->cpus_allowed) ||
3048                    nodes_empty(cs->mems_allowed);
3049
3050         percpu_up_write(&cpuset_rwsem);
3051
3052         /*
3053          * Move tasks to the nearest ancestor with execution resources,
3054          * This is full cgroup operation which will also call back into
3055          * cpuset. Should be done outside any lock.
3056          */
3057         if (is_empty)
3058                 remove_tasks_in_empty_cpuset(cs);
3059
3060         percpu_down_write(&cpuset_rwsem);
3061 }
3062
3063 static void
3064 hotplug_update_tasks(struct cpuset *cs,
3065                      struct cpumask *new_cpus, nodemask_t *new_mems,
3066                      bool cpus_updated, bool mems_updated)
3067 {
3068         if (cpumask_empty(new_cpus))
3069                 cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
3070         if (nodes_empty(*new_mems))
3071                 *new_mems = parent_cs(cs)->effective_mems;
3072
3073         spin_lock_irq(&callback_lock);
3074         cpumask_copy(cs->effective_cpus, new_cpus);
3075         cs->effective_mems = *new_mems;
3076         spin_unlock_irq(&callback_lock);
3077
3078         if (cpus_updated)
3079                 update_tasks_cpumask(cs);
3080         if (mems_updated)
3081                 update_tasks_nodemask(cs);
3082 }
3083
3084 static bool force_rebuild;
3085
3086 void cpuset_force_rebuild(void)
3087 {
3088         force_rebuild = true;
3089 }
3090
3091 /**
3092  * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
3093  * @cs: cpuset in interest
3094  * @tmp: the tmpmasks structure pointer
3095  *
3096  * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
3097  * offline, update @cs accordingly.  If @cs ends up with no CPU or memory,
3098  * all its tasks are moved to the nearest ancestor with both resources.
3099  */
3100 static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
3101 {
3102         static cpumask_t new_cpus;
3103         static nodemask_t new_mems;
3104         bool cpus_updated;
3105         bool mems_updated;
3106         struct cpuset *parent;
3107 retry:
3108         wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
3109
3110         percpu_down_write(&cpuset_rwsem);
3111
3112         /*
3113          * We have raced with task attaching. We wait until attaching
3114          * is finished, so we won't attach a task to an empty cpuset.
3115          */
3116         if (cs->attach_in_progress) {
3117                 percpu_up_write(&cpuset_rwsem);
3118                 goto retry;
3119         }
3120
3121         parent = parent_cs(cs);
3122         compute_effective_cpumask(&new_cpus, cs, parent);
3123         nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
3124
3125         if (cs->nr_subparts_cpus)
3126                 /*
3127                  * Make sure that CPUs allocated to child partitions
3128                  * do not show up in effective_cpus.
3129                  */
3130                 cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus);
3131
3132         if (!tmp || !cs->partition_root_state)
3133                 goto update_tasks;
3134
3135         /*
3136          * In the unlikely event that a partition root has empty
3137          * effective_cpus or its parent becomes erroneous, we have to
3138          * transition it to the erroneous state.
3139          */
3140         if (is_partition_root(cs) && (cpumask_empty(&new_cpus) ||
3141            (parent->partition_root_state == PRS_ERROR))) {
3142                 if (cs->nr_subparts_cpus) {
3143                         spin_lock_irq(&callback_lock);
3144                         cs->nr_subparts_cpus = 0;
3145                         cpumask_clear(cs->subparts_cpus);
3146                         spin_unlock_irq(&callback_lock);
3147                         compute_effective_cpumask(&new_cpus, cs, parent);
3148                 }
3149
3150                 /*
3151                  * If the effective_cpus is empty because the child
3152                  * partitions take away all the CPUs, we can keep
3153                  * the current partition and let the child partitions
3154                  * fight for available CPUs.
3155                  */
3156                 if ((parent->partition_root_state == PRS_ERROR) ||
3157                      cpumask_empty(&new_cpus)) {
3158                         int old_prs;
3159
3160                         update_parent_subparts_cpumask(cs, partcmd_disable,
3161                                                        NULL, tmp);
3162                         old_prs = cs->partition_root_state;
3163                         if (old_prs != PRS_ERROR) {
3164                                 spin_lock_irq(&callback_lock);
3165                                 cs->partition_root_state = PRS_ERROR;
3166                                 spin_unlock_irq(&callback_lock);
3167                                 notify_partition_change(cs, old_prs, PRS_ERROR);
3168                         }
3169                 }
3170                 cpuset_force_rebuild();
3171         }
3172
3173         /*
3174          * On the other hand, an erroneous partition root may be transitioned
3175          * back to a regular one or a partition root with no CPU allocated
3176          * from the parent may change to erroneous.
3177          */
3178         if (is_partition_root(parent) &&
3179            ((cs->partition_root_state == PRS_ERROR) ||
3180             !cpumask_intersects(&new_cpus, parent->subparts_cpus)) &&
3181              update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
3182                 cpuset_force_rebuild();
3183
3184 update_tasks:
3185         cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
3186         mems_updated = !nodes_equal(new_mems, cs->effective_mems);
3187
3188         if (mems_updated)
3189                 check_insane_mems_config(&new_mems);
3190
3191         if (is_in_v2_mode())
3192                 hotplug_update_tasks(cs, &new_cpus, &new_mems,
3193                                      cpus_updated, mems_updated);
3194         else
3195                 hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
3196                                             cpus_updated, mems_updated);
3197
3198         percpu_up_write(&cpuset_rwsem);
3199 }
3200
3201 /**
3202  * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
3203  *
3204  * This function is called after either CPU or memory configuration has
3205  * changed and updates cpuset accordingly.  The top_cpuset is always
3206  * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
3207  * order to make cpusets transparent (of no effect) on systems that are
3208  * actively using CPU hotplug but making no active use of cpusets.
3209  *
3210  * Non-root cpusets are only affected by offlining.  If any CPUs or memory
3211  * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
3212  * all descendants.
3213  *
3214  * Note that CPU offlining during suspend is ignored.  We don't modify
3215  * cpusets across suspend/resume cycles at all.
3216  */
3217 static void cpuset_hotplug_workfn(struct work_struct *work)
3218 {
3219         static cpumask_t new_cpus;
3220         static nodemask_t new_mems;
3221         bool cpus_updated, mems_updated;
3222         bool on_dfl = is_in_v2_mode();
3223         struct tmpmasks tmp, *ptmp = NULL;
3224
3225         if (on_dfl && !alloc_cpumasks(NULL, &tmp))
3226                 ptmp = &tmp;
3227
3228         percpu_down_write(&cpuset_rwsem);
3229
3230         /* fetch the available cpus/mems and find out which changed how */
3231         cpumask_copy(&new_cpus, cpu_active_mask);
3232         new_mems = node_states[N_MEMORY];
3233
3234         /*
3235          * If subparts_cpus is populated, it is likely that the check below
3236          * will produce a false positive on cpus_updated when the cpu list
3237          * isn't changed. It is extra work, but it is better to be safe.
3238          */
3239         cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
3240         mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
3241
3242         /*
3243          * In the rare case that hotplug removes all the cpus in subparts_cpus,
3244          * we assumed that cpus are updated.
3245          */
3246         if (!cpus_updated && top_cpuset.nr_subparts_cpus)
3247                 cpus_updated = true;
3248
3249         /* synchronize cpus_allowed to cpu_active_mask */
3250         if (cpus_updated) {
3251                 spin_lock_irq(&callback_lock);
3252                 if (!on_dfl)
3253                         cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
3254                 /*
3255                  * Make sure that CPUs allocated to child partitions
3256                  * do not show up in effective_cpus. If no CPU is left,
3257                  * we clear the subparts_cpus & let the child partitions
3258                  * fight for the CPUs again.
3259                  */
3260                 if (top_cpuset.nr_subparts_cpus) {
3261                         if (cpumask_subset(&new_cpus,
3262                                            top_cpuset.subparts_cpus)) {
3263                                 top_cpuset.nr_subparts_cpus = 0;
3264                                 cpumask_clear(top_cpuset.subparts_cpus);
3265                         } else {
3266                                 cpumask_andnot(&new_cpus, &new_cpus,
3267                                                top_cpuset.subparts_cpus);
3268                         }
3269                 }
3270                 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
3271                 spin_unlock_irq(&callback_lock);
3272                 /* we don't mess with cpumasks of tasks in top_cpuset */
3273         }
3274
3275         /* synchronize mems_allowed to N_MEMORY */
3276         if (mems_updated) {
3277                 spin_lock_irq(&callback_lock);
3278                 if (!on_dfl)
3279                         top_cpuset.mems_allowed = new_mems;
3280                 top_cpuset.effective_mems = new_mems;
3281                 spin_unlock_irq(&callback_lock);
3282                 update_tasks_nodemask(&top_cpuset);
3283         }
3284
3285         percpu_up_write(&cpuset_rwsem);
3286
3287         /* if cpus or mems changed, we need to propagate to descendants */
3288         if (cpus_updated || mems_updated) {
3289                 struct cpuset *cs;
3290                 struct cgroup_subsys_state *pos_css;
3291
3292                 rcu_read_lock();
3293                 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
3294                         if (cs == &top_cpuset || !css_tryget_online(&cs->css))
3295                                 continue;
3296                         rcu_read_unlock();
3297
3298                         cpuset_hotplug_update_tasks(cs, ptmp);
3299
3300                         rcu_read_lock();
3301                         css_put(&cs->css);
3302                 }
3303                 rcu_read_unlock();
3304         }
3305
3306         /* rebuild sched domains if cpus_allowed has changed */
3307         if (cpus_updated || force_rebuild) {
3308                 force_rebuild = false;
3309                 rebuild_sched_domains();
3310         }
3311
3312         free_cpumasks(NULL, ptmp);
3313 }
3314
3315 void cpuset_update_active_cpus(void)
3316 {
3317         /*
3318          * We're inside cpu hotplug critical region which usually nests
3319          * inside cgroup synchronization.  Bounce actual hotplug processing
3320          * to a work item to avoid reverse locking order.
3321          */
3322         schedule_work(&cpuset_hotplug_work);
3323 }
3324
3325 void cpuset_wait_for_hotplug(void)
3326 {
3327         flush_work(&cpuset_hotplug_work);
3328 }
3329
3330 /*
3331  * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
3332  * Call this routine anytime after node_states[N_MEMORY] changes.
3333  * See cpuset_update_active_cpus() for CPU hotplug handling.
3334  */
3335 static int cpuset_track_online_nodes(struct notifier_block *self,
3336                                 unsigned long action, void *arg)
3337 {
3338         schedule_work(&cpuset_hotplug_work);
3339         return NOTIFY_OK;
3340 }
3341
3342 static struct notifier_block cpuset_track_online_nodes_nb = {
3343         .notifier_call = cpuset_track_online_nodes,
3344         .priority = 10,         /* ??! */
3345 };
3346
3347 /**
3348  * cpuset_init_smp - initialize cpus_allowed
3349  *
3350  * Description: Finish top cpuset after cpu, node maps are initialized
3351  */
3352 void __init cpuset_init_smp(void)
3353 {
3354         cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
3355         top_cpuset.mems_allowed = node_states[N_MEMORY];
3356         top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
3357
3358         cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
3359         top_cpuset.effective_mems = node_states[N_MEMORY];
3360
3361         register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
3362
3363         cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
3364         BUG_ON(!cpuset_migrate_mm_wq);
3365 }
3366
3367 /**
3368  * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
3369  * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
3370  * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
3371  *
3372  * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
3373  * attached to the specified @tsk.  Guaranteed to return some non-empty
3374  * subset of cpu_online_mask, even if this means going outside the
3375  * tasks cpuset.
3376  **/
3377
3378 void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
3379 {
3380         unsigned long flags;
3381
3382         spin_lock_irqsave(&callback_lock, flags);
3383         guarantee_online_cpus(tsk, pmask);
3384         spin_unlock_irqrestore(&callback_lock, flags);
3385 }
3386
3387 /**
3388  * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
3389  * @tsk: pointer to task_struct with which the scheduler is struggling
3390  *
3391  * Description: In the case that the scheduler cannot find an allowed cpu in
3392  * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy
3393  * mode however, this value is the same as task_cs(tsk)->effective_cpus,
3394  * which will not contain a sane cpumask during cases such as cpu hotplugging.
3395  * This is the absolute last resort for the scheduler and it is only used if
3396  * _every_ other avenue has been traveled.
3397  *
3398  * Returns true if the affinity of @tsk was changed, false otherwise.
3399  **/
3400
3401 bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
3402 {
3403         const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
3404         const struct cpumask *cs_mask;
3405         bool changed = false;
3406
3407         rcu_read_lock();
3408         cs_mask = task_cs(tsk)->cpus_allowed;
3409         if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {
3410                 do_set_cpus_allowed(tsk, cs_mask);
3411                 changed = true;
3412         }
3413         rcu_read_unlock();
3414
3415         /*
3416          * We own tsk->cpus_allowed, nobody can change it under us.
3417          *
3418          * But we used cs && cs->cpus_allowed lockless and thus can
3419          * race with cgroup_attach_task() or update_cpumask() and get
3420          * the wrong tsk->cpus_allowed. However, both cases imply the
3421          * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
3422          * which takes task_rq_lock().
3423          *
3424          * If we are called after it dropped the lock we must see all
3425          * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
3426          * set any mask even if it is not right from task_cs() pov,
3427          * the pending set_cpus_allowed_ptr() will fix things.
3428          *
3429          * select_fallback_rq() will fix things ups and set cpu_possible_mask
3430          * if required.
3431          */
3432         return changed;
3433 }
3434
3435 void __init cpuset_init_current_mems_allowed(void)
3436 {
3437         nodes_setall(current->mems_allowed);
3438 }
3439
3440 /**
3441  * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
3442  * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
3443  *
3444  * Description: Returns the nodemask_t mems_allowed of the cpuset
3445  * attached to the specified @tsk.  Guaranteed to return some non-empty
3446  * subset of node_states[N_MEMORY], even if this means going outside the
3447  * tasks cpuset.
3448  **/
3449
3450 nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
3451 {
3452         nodemask_t mask;
3453         unsigned long flags;
3454
3455         spin_lock_irqsave(&callback_lock, flags);
3456         rcu_read_lock();
3457         guarantee_online_mems(task_cs(tsk), &mask);
3458         rcu_read_unlock();
3459         spin_unlock_irqrestore(&callback_lock, flags);
3460
3461         return mask;
3462 }
3463
3464 /**
3465  * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
3466  * @nodemask: the nodemask to be checked
3467  *
3468  * Are any of the nodes in the nodemask allowed in current->mems_allowed?
3469  */
3470 int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
3471 {
3472         return nodes_intersects(*nodemask, current->mems_allowed);
3473 }
3474
/*
 * nearest_hardwall_ancestor() - Walk from @cs towards the root and return
 * the first cpuset (starting with @cs itself) that is mem_exclusive or
 * mem_hardwall.  If none is (an unusual configuration), the walk ends at
 * the cpuset that has no parent, i.e. the root cpuset.
 * Call holding callback_lock.
 */
static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
{
	for (;;) {
		struct cpuset *parent;

		if (is_mem_exclusive(cs) || is_mem_hardwall(cs))
			break;

		parent = parent_cs(cs);
		if (!parent)
			break;

		cs = parent;
	}

	return cs;
}
3487
3488 /**
3489  * cpuset_node_allowed - Can we allocate on a memory node?
3490  * @node: is this an allowed node?
3491  * @gfp_mask: memory allocation flags
3492  *
3493  * If we're in interrupt, yes, we can always allocate.  If @node is set in
3494  * current's mems_allowed, yes.  If it's not a __GFP_HARDWALL request and this
3495  * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
3496  * yes.  If current has access to memory reserves as an oom victim, yes.
3497  * Otherwise, no.
3498  *
3499  * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
3500  * and do not allow allocations outside the current tasks cpuset
3501  * unless the task has been OOM killed.
3502  * GFP_KERNEL allocations are not so marked, so can escape to the
3503  * nearest enclosing hardwalled ancestor cpuset.
3504  *
3505  * Scanning up parent cpusets requires callback_lock.  The
3506  * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
3507  * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
3508  * current tasks mems_allowed came up empty on the first pass over
3509  * the zonelist.  So only GFP_KERNEL allocations, if all nodes in the
3510  * cpuset are short of memory, might require taking the callback_lock.
3511  *
3512  * The first call here from mm/page_alloc:get_page_from_freelist()
3513  * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
3514  * so no allocation on a node outside the cpuset is allowed (unless
3515  * in interrupt, of course).
3516  *
3517  * The second pass through get_page_from_freelist() doesn't even call
3518  * here for GFP_ATOMIC calls.  For those calls, the __alloc_pages()
3519  * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
3520  * in alloc_flags.  That logic and the checks below have the combined
3521  * affect that:
3522  *      in_interrupt - any node ok (current task context irrelevant)
3523  *      GFP_ATOMIC   - any node ok
3524  *      tsk_is_oom_victim   - any node ok
3525  *      GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
3526  *      GFP_USER     - only nodes in current tasks mems allowed ok.
3527  */
3528 bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
3529 {
3530         struct cpuset *cs;              /* current cpuset ancestors */
3531         bool allowed;                   /* is allocation in zone z allowed? */
3532         unsigned long flags;
3533
3534         if (in_interrupt())
3535                 return true;
3536         if (node_isset(node, current->mems_allowed))
3537                 return true;
3538         /*
3539          * Allow tasks that have access to memory reserves because they have
3540          * been OOM killed to get memory anywhere.
3541          */
3542         if (unlikely(tsk_is_oom_victim(current)))
3543                 return true;
3544         if (gfp_mask & __GFP_HARDWALL)  /* If hardwall request, stop here */
3545                 return false;
3546
3547         if (current->flags & PF_EXITING) /* Let dying task have memory */
3548                 return true;
3549
3550         /* Not hardwall and node outside mems_allowed: scan up cpusets */
3551         spin_lock_irqsave(&callback_lock, flags);
3552
3553         rcu_read_lock();
3554         cs = nearest_hardwall_ancestor(task_cs(current));
3555         allowed = node_isset(node, cs->mems_allowed);
3556         rcu_read_unlock();
3557
3558         spin_unlock_irqrestore(&callback_lock, flags);
3559         return allowed;
3560 }
3561
3562 /**
3563  * cpuset_mem_spread_node() - On which node to begin search for a file page
3564  * cpuset_slab_spread_node() - On which node to begin search for a slab page
3565  *
3566  * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
3567  * tasks in a cpuset with is_spread_page or is_spread_slab set),
3568  * and if the memory allocation used cpuset_mem_spread_node()
3569  * to determine on which node to start looking, as it will for
3570  * certain page cache or slab cache pages such as used for file
3571  * system buffers and inode caches, then instead of starting on the
3572  * local node to look for a free page, rather spread the starting
3573  * node around the tasks mems_allowed nodes.
3574  *
3575  * We don't have to worry about the returned node being offline
3576  * because "it can't happen", and even if it did, it would be ok.
3577  *
3578  * The routines calling guarantee_online_mems() are careful to
3579  * only set nodes in task->mems_allowed that are online.  So it
3580  * should not be possible for the following code to return an
3581  * offline node.  But if it did, that would be ok, as this routine
3582  * is not returning the node where the allocation must be, only
3583  * the node where the search should start.  The zonelist passed to
3584  * __alloc_pages() will include all nodes.  If the slab allocator
3585  * is passed an offline node, it will fall back to the local node.
3586  * See kmem_cache_alloc_node().
3587  */
3588
3589 static int cpuset_spread_node(int *rotor)
3590 {
3591         return *rotor = next_node_in(*rotor, current->mems_allowed);
3592 }
3593
3594 int cpuset_mem_spread_node(void)
3595 {
3596         if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
3597                 current->cpuset_mem_spread_rotor =
3598                         node_random(&current->mems_allowed);
3599
3600         return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
3601 }
3602
3603 int cpuset_slab_spread_node(void)
3604 {
3605         if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
3606                 current->cpuset_slab_spread_rotor =
3607                         node_random(&current->mems_allowed);
3608
3609         return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
3610 }
3611
3612 EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
3613
3614 /**
3615  * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
3616  * @tsk1: pointer to task_struct of some task.
3617  * @tsk2: pointer to task_struct of some other task.
3618  *
3619  * Description: Return true if @tsk1's mems_allowed intersects the
3620  * mems_allowed of @tsk2.  Used by the OOM killer to determine if
3621  * one of the task's memory usage might impact the memory available
3622  * to the other.
3623  **/
3624
3625 int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
3626                                    const struct task_struct *tsk2)
3627 {
3628         return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
3629 }
3630
3631 /**
3632  * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
3633  *
3634  * Description: Prints current's name, cpuset name, and cached copy of its
3635  * mems_allowed to the kernel log.
3636  */
3637 void cpuset_print_current_mems_allowed(void)
3638 {
3639         struct cgroup *cgrp;
3640
3641         rcu_read_lock();
3642
3643         cgrp = task_cs(current)->css.cgroup;
3644         pr_cont(",cpuset=");
3645         pr_cont_cgroup_name(cgrp);
3646         pr_cont(",mems_allowed=%*pbl",
3647                 nodemask_pr_args(&current->mems_allowed));
3648
3649         rcu_read_unlock();
3650 }
3651
3652 /*
3653  * Collection of memory_pressure is suppressed unless
3654  * this flag is enabled by writing "1" to the special
3655  * cpuset file 'memory_pressure_enabled' in the root cpuset.
3656  */
3657
3658 int cpuset_memory_pressure_enabled __read_mostly;
3659
3660 /**
3661  * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
3662  *
3663  * Keep a running average of the rate of synchronous (direct)
3664  * page reclaim efforts initiated by tasks in each cpuset.
3665  *
3666  * This represents the rate at which some task in the cpuset
3667  * ran low on memory on all nodes it was allowed to use, and
3668  * had to enter the kernels page reclaim code in an effort to
3669  * create more free memory by tossing clean pages or swapping
3670  * or writing dirty pages.
3671  *
3672  * Display to user space in the per-cpuset read-only file
3673  * "memory_pressure".  Value displayed is an integer
3674  * representing the recent rate of entry into the synchronous
3675  * (direct) page reclaim by any task attached to the cpuset.
3676  **/
3677
3678 void __cpuset_memory_pressure_bump(void)
3679 {
3680         rcu_read_lock();
3681         fmeter_markevent(&task_cs(current)->fmeter);
3682         rcu_read_unlock();
3683 }
3684
3685 #ifdef CONFIG_PROC_PID_CPUSET
3686 /*
3687  * proc_cpuset_show()
3688  *  - Print tasks cpuset path into seq_file.
3689  *  - Used for /proc/<pid>/cpuset.
3690  *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
3691  *    doesn't really matter if tsk->cpuset changes after we read it,
3692  *    and we take cpuset_rwsem, keeping cpuset_attach() from changing it
3693  *    anyway.
3694  */
3695 int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
3696                      struct pid *pid, struct task_struct *tsk)
3697 {
3698         char *buf;
3699         struct cgroup_subsys_state *css;
3700         int retval;
3701
3702         retval = -ENOMEM;
3703         buf = kmalloc(PATH_MAX, GFP_KERNEL);
3704         if (!buf)
3705                 goto out;
3706
3707         css = task_get_css(tsk, cpuset_cgrp_id);
3708         retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
3709                                 current->nsproxy->cgroup_ns);
3710         css_put(css);
3711         if (retval >= PATH_MAX)
3712                 retval = -ENAMETOOLONG;
3713         if (retval < 0)
3714                 goto out_free;
3715         seq_puts(m, buf);
3716         seq_putc(m, '\n');
3717         retval = 0;
3718 out_free:
3719         kfree(buf);
3720 out:
3721         return retval;
3722 }
3723 #endif /* CONFIG_PROC_PID_CPUSET */
3724
3725 /* Display task mems_allowed in /proc/<pid>/status file. */
3726 void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
3727 {
3728         seq_printf(m, "Mems_allowed:\t%*pb\n",
3729                    nodemask_pr_args(&task->mems_allowed));
3730         seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
3731                    nodemask_pr_args(&task->mems_allowed));
3732 }