kernel/cgroup/cpuset.c

   1 /*
   2  *  kernel/cpuset.c
   3  *
   4  *  Processor and Memory placement constraints for sets of tasks.
   5  *
   6  *  Copyright (C) 2003 BULL SA.
   7  *  Copyright (C) 2004-2007 Silicon Graphics, Inc.
   8  *  Copyright (C) 2006 Google, Inc
   9  *
  10  *  Portions derived from Patrick Mochel's sysfs code.
  11  *  sysfs is Copyright (c) 2001-3 Patrick Mochel
  12  *
  13  *  2003-10-10 Written by Simon Derr.
  14  *  2003-10-22 Updates by Stephen Hemminger.
  15  *  2004 May-July Rework by Paul Jackson.
  16  *  2006 Rework by Paul Menage to use generic cgroups
  17  *  2008 Rework of the scheduler domains and CPU hotplug handling
  18  *       by Max Krasnyansky
  19  *
  20  *  This file is subject to the terms and conditions of the GNU General Public
  21  *  License.  See the file COPYING in the main directory of the Linux
  22  *  distribution for more details.
  23  */
  24
  25 #include <linux/cpu.h>
  26 #include <linux/cpumask.h>
  27 #include <linux/cpuset.h>
  28 #include <linux/err.h>
  29 #include <linux/errno.h>
  30 #include <linux/file.h>
  31 #include <linux/fs.h>
  32 #include <linux/init.h>
  33 #include <linux/interrupt.h>
  34 #include <linux/kernel.h>
  35 #include <linux/kmod.h>
  36 #include <linux/list.h>
  37 #include <linux/mempolicy.h>
  38 #include <linux/mm.h>
  39 #include <linux/memory.h>
  40 #include <linux/export.h>
  41 #include <linux/mount.h>
  42 #include <linux/namei.h>
  43 #include <linux/pagemap.h>
  44 #include <linux/proc_fs.h>
  45 #include <linux/rcupdate.h>
  46 #include <linux/sched.h>
  47 #include <linux/sched/mm.h>
  48 #include <linux/sched/task.h>
  49 #include <linux/seq_file.h>
  50 #include <linux/security.h>
  51 #include <linux/slab.h>
  52 #include <linux/spinlock.h>
  53 #include <linux/stat.h>
  54 #include <linux/string.h>
  55 #include <linux/time.h>
  56 #include <linux/time64.h>
  57 #include <linux/backing-dev.h>
  58 #include <linux/sort.h>
  59 #include <linux/oom.h>
  60 #include <linux/sched/isolation.h>
  61 #include <linux/uaccess.h>
  62 #include <linux/atomic.h>
  63 #include <linux/mutex.h>
  64 #include <linux/cgroup.h>
  65 #include <linux/wait.h>
  66
  67 DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
  68 DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
  69
  70 /* See "Frequency meter" comments, below. */
  71
  72 struct fmeter {
  73         int cnt;                /* unprocessed events count */
  74         int val;                /* most recent output value */
  75         time64_t time;          /* clock (secs) when val computed */
  76         spinlock_t lock;        /* guards read or write of above */
  77 };
  78
  79 struct cpuset {
  80         struct cgroup_subsys_state css;
  81
  82         unsigned long flags;            /* "unsigned long" so bitops work */
  83
  84         /*
  85          * On default hierarchy:
  86          *
  87          * The user-configured masks can only be changed by writing to
  88          * cpuset.cpus and cpuset.mems, and won't be limited by the
  89          * parent masks.
  90          *
  91          * The effective masks is the real masks that apply to the tasks
  92          * in the cpuset. They may be changed if the configured masks are
  93          * changed or hotplug happens.
  94          *
  95          * effective_mask == configured_mask & parent's effective_mask,
  96          * and if it ends up empty, it will inherit the parent's mask.
  97          *
  98          *
  99          * On legacy hierachy:
 100          *
 101          * The user-configured masks are always the same with effective masks.
 102          */
 103
 104         /* user-configured CPUs and Memory Nodes allow to tasks */
 105         cpumask_var_t cpus_allowed;
 106         nodemask_t mems_allowed;
 107
 108         /* effective CPUs and Memory Nodes allow to tasks */
 109         cpumask_var_t effective_cpus;
 110         nodemask_t effective_mems;
 111
 112         /*
 113          * CPUs allocated to child sub-partitions (default hierarchy only)
 114          * - CPUs granted by the parent = effective_cpus U subparts_cpus
 115          * - effective_cpus and subparts_cpus are mutually exclusive.
 116          */
 117         cpumask_var_t subparts_cpus;
 118
 119         /*
 120          * This is old Memory Nodes tasks took on.
 121          *
 122          * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
 123          * - A new cpuset's old_mems_allowed is initialized when some
 124          *   task is moved into it.
 125          * - old_mems_allowed is used in cpuset_migrate_mm() when we change
 126          *   cpuset.mems_allowed and have tasks' nodemask updated, and
 127          *   then old_mems_allowed is updated to mems_allowed.
 128          */
 129         nodemask_t old_mems_allowed;
 130
 131         struct fmeter fmeter;           /* memory_pressure filter */
 132
 133         /*
 134          * Tasks are being attached to this cpuset.  Used to prevent
 135          * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
 136          */
 137         int attach_in_progress;
 138
 139         /* partition number for rebuild_sched_domains() */
 140         int pn;
 141
 142         /* for custom sched domain */
 143         int relax_domain_level;
 144
 145         /* number of CPUs in subparts_cpus */
 146         int nr_subparts_cpus;
 147
 148         /* partition root state */
 149         int partition_root_state;
 150
 151         /*
 152          * Default hierarchy only:
 153          * use_parent_ecpus - set if using parent's effective_cpus
 154          * child_ecpus_count - # of children with use_parent_ecpus set
 155          */
 156         int use_parent_ecpus;
 157         int child_ecpus_count;
 158 };
 159
 160 /*
 161  * Partition root states:
 162  *
 163  *   0 - not a partition root
 164  *
 165  *   1 - partition root
 166  *
 167  *  -1 - invalid partition root
 168  *       None of the cpus in cpus_allowed can be put into the parent's
 169  *       subparts_cpus. In this case, the cpuset is not a real partition
 170  *       root anymore.  However, the CPU_EXCLUSIVE bit will still be set
 171  *       and the cpuset can be restored back to a partition root if the
 172  *       parent cpuset can give more CPUs back to this child cpuset.
 173  */
 174 #define PRS_DISABLED            0
 175 #define PRS_ENABLED             1
 176 #define PRS_ERROR               -1
 177
 178 /*
 179  * Temporary cpumasks for working with partitions that are passed among
 180  * functions to avoid memory allocation in inner functions.
 181  */
 182 struct tmpmasks {
 183         cpumask_var_t addmask, delmask; /* For partition root */
 184         cpumask_var_t new_cpus;         /* For update_cpumasks_hier() */
 185 };
 186
 187 static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
 188 {
 189         return css ? container_of(css, struct cpuset, css) : NULL;
 190 }
 191
 192 /* Retrieve the cpuset for a task */
 193 static inline struct cpuset *task_cs(struct task_struct *task)
 194 {
 195         return css_cs(task_css(task, cpuset_cgrp_id));
 196 }
 197
 198 static inline struct cpuset *parent_cs(struct cpuset *cs)
 199 {
 200         return css_cs(cs->css.parent);
 201 }
 202
 203 #ifdef CONFIG_NUMA
 204 static inline bool task_has_mempolicy(struct task_struct *task)
 205 {
 206         return task->mempolicy;
 207 }
 208 #else
 209 static inline bool task_has_mempolicy(struct task_struct *task)
 210 {
 211         return false;
 212 }
 213 #endif
 214
 215
 216 /* bits in struct cpuset flags field */
 217 typedef enum {
 218         CS_ONLINE,
 219         CS_CPU_EXCLUSIVE,
 220         CS_MEM_EXCLUSIVE,
 221         CS_MEM_HARDWALL,
 222         CS_MEMORY_MIGRATE,
 223         CS_SCHED_LOAD_BALANCE,
 224         CS_SPREAD_PAGE,
 225         CS_SPREAD_SLAB,
 226 } cpuset_flagbits_t;
 227
 228 /* convenient tests for these bits */
 229 static inline bool is_cpuset_online(struct cpuset *cs)
 230 {
 231         return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
 232 }
 233
 234 static inline int is_cpu_exclusive(const struct cpuset *cs)
 235 {
 236         return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
 237 }
 238
 239 static inline int is_mem_exclusive(const struct cpuset *cs)
 240 {
 241         return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
 242 }
 243
 244 static inline int is_mem_hardwall(const struct cpuset *cs)
 245 {
 246         return test_bit(CS_MEM_HARDWALL, &cs->flags);
 247 }
 248
 249 static inline int is_sched_load_balance(const struct cpuset *cs)
 250 {
 251         return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
 252 }
 253
 254 static inline int is_memory_migrate(const struct cpuset *cs)
 255 {
 256         return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
 257 }
 258
 259 static inline int is_spread_page(const struct cpuset *cs)
 260 {
 261         return test_bit(CS_SPREAD_PAGE, &cs->flags);
 262 }
 263
 264 static inline int is_spread_slab(const struct cpuset *cs)
 265 {
 266         return test_bit(CS_SPREAD_SLAB, &cs->flags);
 267 }
 268
 269 static inline int is_partition_root(const struct cpuset *cs)
 270 {
 271         return cs->partition_root_state > 0;
 272 }
 273
 274 static struct cpuset top_cpuset = {
 275         .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
 276                   (1 << CS_MEM_EXCLUSIVE)),
 277         .partition_root_state = PRS_ENABLED,
 278 };
 279
 280 /**
 281  * cpuset_for_each_child - traverse online children of a cpuset
 282  * @child_cs: loop cursor pointing to the current child
 283  * @pos_css: used for iteration
 284  * @parent_cs: target cpuset to walk children of
 285  *
 286  * Walk @child_cs through the online children of @parent_cs.  Must be used
 287  * with RCU read locked.
 288  */
 289 #define cpuset_for_each_child(child_cs, pos_css, parent_cs)             \
 290         css_for_each_child((pos_css), &(parent_cs)->css)                \
 291                 if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
 292
 293 /**
 294  * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
 295  * @des_cs: loop cursor pointing to the current descendant
 296  * @pos_css: used for iteration
 297  * @root_cs: target cpuset to walk ancestor of
 298  *
 299  * Walk @des_cs through the online descendants of @root_cs.  Must be used
 300  * with RCU read locked.  The caller may modify @pos_css by calling
 301  * css_rightmost_descendant() to skip subtree.  @root_cs is included in the
 302  * iteration and the first node to be visited.
 303  */
 304 #define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)        \
 305         css_for_each_descendant_pre((pos_css), &(root_cs)->css)         \
 306                 if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
 307
 308 /*
 309  * There are two global locks guarding cpuset structures - cpuset_mutex and
 310  * callback_lock. We also require taking task_lock() when dereferencing a
 311  * task's cpuset pointer. See "The task_lock() exception", at the end of this
 312  * comment.
 313  *
 314  * A task must hold both locks to modify cpusets.  If a task holds
 315  * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
 316  * is the only task able to also acquire callback_lock and be able to
 317  * modify cpusets.  It can perform various checks on the cpuset structure
 318  * first, knowing nothing will change.  It can also allocate memory while
 319  * just holding cpuset_mutex.  While it is performing these checks, various
 320  * callback routines can briefly acquire callback_lock to query cpusets.
 321  * Once it is ready to make the changes, it takes callback_lock, blocking
 322  * everyone else.
 323  *
 324  * Calls to the kernel memory allocator can not be made while holding
 325  * callback_lock, as that would risk double tripping on callback_lock
 326  * from one of the callbacks into the cpuset code from within
 327  * __alloc_pages().
 328  *
 329  * If a task is only holding callback_lock, then it has read-only
 330  * access to cpusets.
 331  *
 332  * Now, the task_struct fields mems_allowed and mempolicy may be changed
 333  * by other task, we use alloc_lock in the task_struct fields to protect
 334  * them.
 335  *
 336  * The cpuset_common_file_read() handlers only hold callback_lock across
 337  * small pieces of code, such as when reading out possibly multi-word
 338  * cpumasks and nodemasks.
 339  *
 340  * Accessing a task's cpuset should be done in accordance with the
 341  * guidelines for accessing subsystem state in kernel/cgroup.c
 342  */
 343
 344 static DEFINE_MUTEX(cpuset_mutex);
 345 static DEFINE_SPINLOCK(callback_lock);
 346
 347 static struct workqueue_struct *cpuset_migrate_mm_wq;
 348
 349 /*
 350  * CPU / memory hotplug is handled asynchronously.
 351  */
 352 static void cpuset_hotplug_workfn(struct work_struct *work);
 353 static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
 354
 355 static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
 356
 357 /*
 358  * Cgroup v2 behavior is used when on default hierarchy or the
 359  * cgroup_v2_mode flag is set.
 360  */
 361 static inline bool is_in_v2_mode(void)
 362 {
 363         return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
 364               (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
 365 }
 366
 367 /*
 368  * This is ugly, but preserves the userspace API for existing cpuset
 369  * users. If someone tries to mount the "cpuset" filesystem, we
 370  * silently switch it to mount "cgroup" instead
 371  */
 372 static struct dentry *cpuset_mount(struct file_system_type *fs_type,
 373                          int flags, const char *unused_dev_name, void *data)
 374 {
 375         struct file_system_type *cgroup_fs = get_fs_type("cgroup");
 376         struct dentry *ret = ERR_PTR(-ENODEV);
 377         if (cgroup_fs) {
 378                 char mountopts[] =
 379                         "cpuset,noprefix,"
 380                         "release_agent=/sbin/cpuset_release_agent";
 381                 ret = cgroup_fs->mount(cgroup_fs, flags,
 382                                            unused_dev_name, mountopts);
 383                 put_filesystem(cgroup_fs);
 384         }
 385         return ret;
 386 }
 387
 388 static struct file_system_type cpuset_fs_type = {
 389         .name = "cpuset",
 390         .mount = cpuset_mount,
 391 };
 392
 393 /*
 394  * Return in pmask the portion of a cpusets's cpus_allowed that
 395  * are online.  If none are online, walk up the cpuset hierarchy
 396  * until we find one that does have some online cpus.
 397  *
 398  * One way or another, we guarantee to return some non-empty subset
 399  * of cpu_online_mask.
 400  *
 401  * Call with callback_lock or cpuset_mutex held.
 402  */
 403 static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
 404 {
 405         while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
 406                 cs = parent_cs(cs);
 407                 if (unlikely(!cs)) {
 408                         /*
 409                          * The top cpuset doesn't have any online cpu as a
 410                          * consequence of a race between cpuset_hotplug_work
 411                          * and cpu hotplug notifier.  But we know the top
 412                          * cpuset's effective_cpus is on its way to to be
 413                          * identical to cpu_online_mask.
 414                          */
 415                         cpumask_copy(pmask, cpu_online_mask);
 416                         return;
 417                 }
 418         }
 419         cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
 420 }
 421
 422 /*
 423  * Return in *pmask the portion of a cpusets's mems_allowed that
 424  * are online, with memory.  If none are online with memory, walk
 425  * up the cpuset hierarchy until we find one that does have some
 426  * online mems.  The top cpuset always has some mems online.
 427  *
 428  * One way or another, we guarantee to return some non-empty subset
 429  * of node_states[N_MEMORY].
 430  *
 431  * Call with callback_lock or cpuset_mutex held.
 432  */
 433 static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
 434 {
 435         while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
 436                 cs = parent_cs(cs);
 437         nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
 438 }
 439
 440 /*
 441  * update task's spread flag if cpuset's page/slab spread flag is set
 442  *
 443  * Call with callback_lock or cpuset_mutex held.
 444  */
 445 static void cpuset_update_task_spread_flag(struct cpuset *cs,
 446                                         struct task_struct *tsk)
 447 {
 448         if (is_spread_page(cs))
 449                 task_set_spread_page(tsk);
 450         else
 451                 task_clear_spread_page(tsk);
 452
 453         if (is_spread_slab(cs))
 454                 task_set_spread_slab(tsk);
 455         else
 456                 task_clear_spread_slab(tsk);
 457 }
 458
 459 /*
 460  * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 461  *
 462  * One cpuset is a subset of another if all its allowed CPUs and
 463  * Memory Nodes are a subset of the other, and its exclusive flags
 464  * are only set if the other's are set.  Call holding cpuset_mutex.
 465  */
 466
 467 static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
 468 {
 469         return  cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
 470                 nodes_subset(p->mems_allowed, q->mems_allowed) &&
 471                 is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
 472                 is_mem_exclusive(p) <= is_mem_exclusive(q);
 473 }
 474
 475 /**
 476  * alloc_cpumasks - allocate three cpumasks for cpuset
 477  * @cs:  the cpuset that have cpumasks to be allocated.
 478  * @tmp: the tmpmasks structure pointer
 479  * Return: 0 if successful, -ENOMEM otherwise.
 480  *
 481  * Only one of the two input arguments should be non-NULL.
 482  */
 483 static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
 484 {
 485         cpumask_var_t *pmask1, *pmask2, *pmask3;
 486
 487         if (cs) {
 488                 pmask1 = &cs->cpus_allowed;
 489                 pmask2 = &cs->effective_cpus;
 490                 pmask3 = &cs->subparts_cpus;
 491         } else {
 492                 pmask1 = &tmp->new_cpus;
 493                 pmask2 = &tmp->addmask;
 494                 pmask3 = &tmp->delmask;
 495         }
 496
 497         if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
 498                 return -ENOMEM;
 499
 500         if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
 501                 goto free_one;
 502
 503         if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
 504                 goto free_two;
 505
 506         return 0;
 507
 508 free_two:
 509         free_cpumask_var(*pmask2);
 510 free_one:
 511         free_cpumask_var(*pmask1);
 512         return -ENOMEM;
 513 }
 514
 515 /**
 516  * free_cpumasks - free cpumasks in a tmpmasks structure
 517  * @cs:  the cpuset that have cpumasks to be free.
 518  * @tmp: the tmpmasks structure pointer
 519  */
 520 static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
 521 {
 522         if (cs) {
 523                 free_cpumask_var(cs->cpus_allowed);
 524                 free_cpumask_var(cs->effective_cpus);
 525                 free_cpumask_var(cs->subparts_cpus);
 526         }
 527         if (tmp) {
 528                 free_cpumask_var(tmp->new_cpus);
 529                 free_cpumask_var(tmp->addmask);
 530                 free_cpumask_var(tmp->delmask);
 531         }
 532 }
 533
 534 /**
 535  * alloc_trial_cpuset - allocate a trial cpuset
 536  * @cs: the cpuset that the trial cpuset duplicates
 537  */
 538 static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
 539 {
 540         struct cpuset *trial;
 541
 542         trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
 543         if (!trial)
 544                 return NULL;
 545
 546         if (alloc_cpumasks(trial, NULL)) {
 547                 kfree(trial);
 548                 return NULL;
 549         }
 550
 551         cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
 552         cpumask_copy(trial->effective_cpus, cs->effective_cpus);
 553         return trial;
 554 }
 555
 556 /**
 557  * free_cpuset - free the cpuset
 558  * @cs: the cpuset to be freed
 559  */
 560 static inline void free_cpuset(struct cpuset *cs)
 561 {
 562         free_cpumasks(cs, NULL);
 563         kfree(cs);
 564 }
 565
 566 /*
 567  * validate_change() - Used to validate that any proposed cpuset change
 568  *                     follows the structural rules for cpusets.
 569  *
 570  * If we replaced the flag and mask values of the current cpuset
 571  * (cur) with those values in the trial cpuset (trial), would
 572  * our various subset and exclusive rules still be valid?  Presumes
 573  * cpuset_mutex held.
 574  *
 575  * 'cur' is the address of an actual, in-use cpuset.  Operations
 576  * such as list traversal that depend on the actual address of the
 577  * cpuset in the list must use cur below, not trial.
 578  *
 579  * 'trial' is the address of bulk structure copy of cur, with
 580  * perhaps one or more of the fields cpus_allowed, mems_allowed,
 581  * or flags changed to new, trial values.
 582  *
 583  * Return 0 if valid, -errno if not.
 584  */
 585
 586 static int validate_change(struct cpuset *cur, struct cpuset *trial)
 587 {
 588         struct cgroup_subsys_state *css;
 589         struct cpuset *c, *par;
 590         int ret;
 591
 592         rcu_read_lock();
 593
 594         /* Each of our child cpusets must be a subset of us */
 595         ret = -EBUSY;
 596         cpuset_for_each_child(c, css, cur)
 597                 if (!is_cpuset_subset(c, trial))
 598                         goto out;
 599
 600         /* Remaining checks don't apply to root cpuset */
 601         ret = 0;
 602         if (cur == &top_cpuset)
 603                 goto out;
 604
 605         par = parent_cs(cur);
 606
 607         /* On legacy hiearchy, we must be a subset of our parent cpuset. */
 608         ret = -EACCES;
 609         if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
 610                 goto out;
 611
 612         /*
 613          * If either I or some sibling (!= me) is exclusive, we can't
 614          * overlap
 615          */
 616         ret = -EINVAL;
 617         cpuset_for_each_child(c, css, par) {
 618                 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
 619                     c != cur &&
 620                     cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
 621                         goto out;
 622                 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
 623                     c != cur &&
 624                     nodes_intersects(trial->mems_allowed, c->mems_allowed))
 625                         goto out;
 626         }
 627
 628         /*
 629          * Cpusets with tasks - existing or newly being attached - can't
 630          * be changed to have empty cpus_allowed or mems_allowed.
 631          */
 632         ret = -ENOSPC;
 633         if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
 634                 if (!cpumask_empty(cur->cpus_allowed) &&
 635                     cpumask_empty(trial->cpus_allowed))
 636                         goto out;
 637                 if (!nodes_empty(cur->mems_allowed) &&
 638                     nodes_empty(trial->mems_allowed))
 639                         goto out;
 640         }
 641
 642         /*
 643          * We can't shrink if we won't have enough room for SCHED_DEADLINE
 644          * tasks.
 645          */
 646         ret = -EBUSY;
 647         if (is_cpu_exclusive(cur) &&
 648             !cpuset_cpumask_can_shrink(cur->cpus_allowed,
 649                                        trial->cpus_allowed))
 650                 goto out;
 651
 652         ret = 0;
 653 out:
 654         rcu_read_unlock();
 655         return ret;
 656 }
 657
 658 #ifdef CONFIG_SMP
 659 /*
 660  * Helper routine for generate_sched_domains().
 661  * Do cpusets a, b have overlapping effective cpus_allowed masks?
 662  */
 663 static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
 664 {
 665         return cpumask_intersects(a->effective_cpus, b->effective_cpus);
 666 }
 667
 668 static void
 669 update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
 670 {
 671         if (dattr->relax_domain_level < c->relax_domain_level)
 672                 dattr->relax_domain_level = c->relax_domain_level;
 673         return;
 674 }
 675
 676 static void update_domain_attr_tree(struct sched_domain_attr *dattr,
 677                                     struct cpuset *root_cs)
 678 {
 679         struct cpuset *cp;
 680         struct cgroup_subsys_state *pos_css;
 681
 682         rcu_read_lock();
 683         cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
 684                 /* skip the whole subtree if @cp doesn't have any CPU */
 685                 if (cpumask_empty(cp->cpus_allowed)) {
 686                         pos_css = css_rightmost_descendant(pos_css);
 687                         continue;
 688                 }
 689
 690                 if (is_sched_load_balance(cp))
 691                         update_domain_attr(dattr, cp);
 692         }
 693         rcu_read_unlock();
 694 }
 695
 696 /* Must be called with cpuset_mutex held.  */
 697 static inline int nr_cpusets(void)
 698 {
 699         /* jump label reference count + the top-level cpuset */
 700         return static_key_count(&cpusets_enabled_key.key) + 1;
 701 }
 702
 703 /*
 704  * generate_sched_domains()
 705  *
 706  * This function builds a partial partition of the systems CPUs
 707  * A 'partial partition' is a set of non-overlapping subsets whose
 708  * union is a subset of that set.
 709  * The output of this function needs to be passed to kernel/sched/core.c
 710  * partition_sched_domains() routine, which will rebuild the scheduler's
 711  * load balancing domains (sched domains) as specified by that partial
 712  * partition.
 713  *
 714  * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.txt
 715  * for a background explanation of this.
 716  *
 717  * Does not return errors, on the theory that the callers of this
 718  * routine would rather not worry about failures to rebuild sched
 719  * domains when operating in the severe memory shortage situations
 720  * that could cause allocation failures below.
 721  *
 722  * Must be called with cpuset_mutex held.
 723  *
 724  * The three key local variables below are:
 725  *    q  - a linked-list queue of cpuset pointers, used to implement a
 726  *         top-down scan of all cpusets.  This scan loads a pointer
 727  *         to each cpuset marked is_sched_load_balance into the
 728  *         array 'csa'.  For our purposes, rebuilding the schedulers
 729  *         sched domains, we can ignore !is_sched_load_balance cpusets.
 730  *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
 731  *         that need to be load balanced, for convenient iterative
 732  *         access by the subsequent code that finds the best partition,
 733  *         i.e the set of domains (subsets) of CPUs such that the
 734  *         cpus_allowed of every cpuset marked is_sched_load_balance
 735  *         is a subset of one of these domains, while there are as
 736  *         many such domains as possible, each as small as possible.
 737  * doms  - Conversion of 'csa' to an array of cpumasks, for passing to
 738  *         the kernel/sched/core.c routine partition_sched_domains() in a
 739  *         convenient format, that can be easily compared to the prior
 740  *         value to determine what partition elements (sched domains)
 741  *         were changed (added or removed.)
 742  *
 743  * Finding the best partition (set of domains):
 744  *      The triple nested loops below over i, j, k scan over the
 745  *      load balanced cpusets (using the array of cpuset pointers in
 746  *      csa[]) looking for pairs of cpusets that have overlapping
 747  *      cpus_allowed, but which don't have the same 'pn' partition
 748  *      number and gives them in the same partition number.  It keeps
 749  *      looping on the 'restart' label until it can no longer find
 750  *      any such pairs.
 751  *
 752  *      The union of the cpus_allowed masks from the set of
 753  *      all cpusets having the same 'pn' value then form the one
 754  *      element of the partition (one sched domain) to be passed to
 755  *      partition_sched_domains().
 756  */
 757 static int generate_sched_domains(cpumask_var_t **domains,
 758                         struct sched_domain_attr **attributes)
 759 {
 760         struct cpuset *cp;      /* scans q */
 761         struct cpuset **csa;    /* array of all cpuset ptrs */
 762         int csn;                /* how many cpuset ptrs in csa so far */
 763         int i, j, k;            /* indices for partition finding loops */
 764         cpumask_var_t *doms;    /* resulting partition; i.e. sched domains */
 765         struct sched_domain_attr *dattr;  /* attributes for custom domains */
 766         int ndoms = 0;          /* number of sched domains in result */
 767         int nslot;              /* next empty doms[] struct cpumask slot */
 768         struct cgroup_subsys_state *pos_css;
 769
 770         doms = NULL;
 771         dattr = NULL;
 772         csa = NULL;
 773
 774         /* Special case for the 99% of systems with one, full, sched domain */
 775         if (is_sched_load_balance(&top_cpuset)) {
 776                 ndoms = 1;
 777                 doms = alloc_sched_domains(ndoms);
 778                 if (!doms)
 779                         goto done;
 780
 781                 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
 782                 if (dattr) {
 783                         *dattr = SD_ATTR_INIT;
 784                         update_domain_attr_tree(dattr, &top_cpuset);
 785                 }
 786                 cpumask_and(doms[0], top_cpuset.effective_cpus,
 787                             housekeeping_cpumask(HK_FLAG_DOMAIN));
 788
 789                 goto done;
 790         }
 791
 792         csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
 793         if (!csa)
 794                 goto done;
 795         csn = 0;
 796
 797         rcu_read_lock();
 798         cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
 799                 if (cp == &top_cpuset)
 800                         continue;
 801                 /*
 802                  * Continue traversing beyond @cp iff @cp has some CPUs and
 803                  * isn't load balancing.  The former is obvious.  The
 804                  * latter: All child cpusets contain a subset of the
 805                  * parent's cpus, so just skip them, and then we call
 806                  * update_domain_attr_tree() to calc relax_domain_level of
 807                  * the corresponding sched domain.
 808                  */
 809                 if (!cpumask_empty(cp->cpus_allowed) &&
 810                     !(is_sched_load_balance(cp) &&
 811                       cpumask_intersects(cp->cpus_allowed,
 812                                          housekeeping_cpumask(HK_FLAG_DOMAIN))))
 813                         continue;
 814
 815                 if (is_sched_load_balance(cp))
 816                         csa[csn++] = cp;
 817
 818                 /* skip @cp's subtree */
 819                 pos_css = css_rightmost_descendant(pos_css);
 820         }
 821         rcu_read_unlock();
 822
 823         for (i = 0; i < csn; i++)
 824                 csa[i]->pn = i;
 825         ndoms = csn;
 826
 827 restart:
 828         /* Find the best partition (set of sched domains) */
 829         for (i = 0; i < csn; i++) {
 830                 struct cpuset *a = csa[i];
 831                 int apn = a->pn;
 832
 833                 for (j = 0; j < csn; j++) {
 834                         struct cpuset *b = csa[j];
 835                         int bpn = b->pn;
 836
 837                         if (apn != bpn && cpusets_overlap(a, b)) {
 838                                 for (k = 0; k < csn; k++) {
 839                                         struct cpuset *c = csa[k];
 840
 841                                         if (c->pn == bpn)
 842                                                 c->pn = apn;
 843                                 }
 844                                 ndoms--;        /* one less element */
 845                                 goto restart;
 846                         }
 847                 }
 848         }
 849
 850         /*
 851          * Now we know how many domains to create.
 852          * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
 853          */
 854         doms = alloc_sched_domains(ndoms);
 855         if (!doms)
 856                 goto done;
 857
 858         /*
 859          * The rest of the code, including the scheduler, can deal with
 860          * dattr==NULL case. No need to abort if alloc fails.
 861          */
 862         dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
 863                               GFP_KERNEL);
 864
 865         for (nslot = 0, i = 0; i < csn; i++) {
 866                 struct cpuset *a = csa[i];
 867                 struct cpumask *dp;
 868                 int apn = a->pn;
 869
 870                 if (apn < 0) {
 871                         /* Skip completed partitions */
 872                         continue;
 873                 }
 874
 875                 dp = doms[nslot];
 876
 877                 if (nslot == ndoms) {
 878                         static int warnings = 10;
 879                         if (warnings) {
 880                                 pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
 881                                         nslot, ndoms, csn, i, apn);
 882                                 warnings--;
 883                         }
 884                         continue;
 885                 }
 886
 887                 cpumask_clear(dp);
 888                 if (dattr)
 889                         *(dattr + nslot) = SD_ATTR_INIT;
 890                 for (j = i; j < csn; j++) {
 891                         struct cpuset *b = csa[j];
 892
 893                         if (apn == b->pn) {
 894                                 cpumask_or(dp, dp, b->effective_cpus);
 895                                 cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
 896                                 if (dattr)
 897                                         update_domain_attr_tree(dattr + nslot, b);
 898
 899                                 /* Done with this partition */
 900                                 b->pn = -1;
 901                         }
 902                 }
 903                 nslot++;
 904         }
 905         BUG_ON(nslot != ndoms);
 906
 907 done:
 908         kfree(csa);
 909
 910         /*
 911          * Fallback to the default domain if kmalloc() failed.
 912          * See comments in partition_sched_domains().
 913          */
 914         if (doms == NULL)
 915                 ndoms = 1;
 916
 917         *domains    = doms;
 918         *attributes = dattr;
 919         return ndoms;
 920 }
 921
 922 /*
 923  * Rebuild scheduler domains.
 924  *
 925  * If the flag 'sched_load_balance' of any cpuset with non-empty
 926  * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
 927  * which has that flag enabled, or if any cpuset with a non-empty
 928  * 'cpus' is removed, then call this routine to rebuild the
 929  * scheduler's dynamic sched domains.
 930  *
 931  * Call with cpuset_mutex held.  Takes get_online_cpus().
 932  */
 933 static void rebuild_sched_domains_locked(void)
 934 {
 935         struct sched_domain_attr *attr;
 936         cpumask_var_t *doms;
 937         int ndoms;
 938
 939         lockdep_assert_held(&cpuset_mutex);
 940         get_online_cpus();
 941
 942         /*
 943          * We have raced with CPU hotplug. Don't do anything to avoid
 944          * passing doms with offlined cpu to partition_sched_domains().
 945          * Anyways, hotplug work item will rebuild sched domains.
 946          */
 947         if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
 948                 goto out;
 949
 950         /* Generate domain masks and attrs */
 951         ndoms = generate_sched_domains(&doms, &attr);
 952
 953         /* Have scheduler rebuild the domains */
 954         partition_sched_domains(ndoms, doms, attr);
 955 out:
 956         put_online_cpus();
 957 }
 958 #else /* !CONFIG_SMP */
 959 static void rebuild_sched_domains_locked(void)
 960 {
 961 }
 962 #endif /* CONFIG_SMP */
 963
 964 void rebuild_sched_domains(void)
 965 {
 966         mutex_lock(&cpuset_mutex);
 967         rebuild_sched_domains_locked();
 968         mutex_unlock(&cpuset_mutex);
 969 }
 970
 971 /**
 972  * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 973  * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
 974  *
 975  * Iterate through each task of @cs updating its cpus_allowed to the
 976  * effective cpuset's.  As this function is called with cpuset_mutex held,
 977  * cpuset membership stays stable.
 978  */
 979 static void update_tasks_cpumask(struct cpuset *cs)
 980 {
 981         struct css_task_iter it;
 982         struct task_struct *task;
 983
 984         css_task_iter_start(&cs->css, 0, &it);
 985         while ((task = css_task_iter_next(&it)))
 986                 set_cpus_allowed_ptr(task, cs->effective_cpus);
 987         css_task_iter_end(&it);
 988 }
 989
 990 /**
 991  * compute_effective_cpumask - Compute the effective cpumask of the cpuset
 992  * @new_cpus: the temp variable for the new effective_cpus mask
 993  * @cs: the cpuset the need to recompute the new effective_cpus mask
 994  * @parent: the parent cpuset
 995  *
 996  * If the parent has subpartition CPUs, include them in the list of
 997  * allowable CPUs in computing the new effective_cpus mask.
 998  */
 999 static void compute_effective_cpumask(struct cpumask *new_cpus,
1000                                       struct cpuset *cs, struct cpuset *parent)
1001 {
1002         if (parent->nr_subparts_cpus) {
1003                 cpumask_or(new_cpus, parent->effective_cpus,
1004                            parent->subparts_cpus);
1005                 cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
1006         } else {
1007                 cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
1008         }
1009 }
1010
1011 /*
1012  * Commands for update_parent_subparts_cpumask
1013  */
1014 enum subparts_cmd {
1015         partcmd_enable,         /* Enable partition root         */
1016         partcmd_disable,        /* Disable partition root        */
1017         partcmd_update,         /* Update parent's subparts_cpus */
1018 };
1019
1020 /**
1021  * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
1022  * @cpuset:  The cpuset that requests change in partition root state
1023  * @cmd:     Partition root state change command
1024  * @newmask: Optional new cpumask for partcmd_update
1025  * @tmp:     Temporary addmask and delmask
1026  * Return:   0, 1 or an error code
1027  *
1028  * For partcmd_enable, the cpuset is being transformed from a non-partition
1029  * root to a partition root. The cpus_allowed mask of the given cpuset will
1030  * be put into parent's subparts_cpus and taken away from parent's
1031  * effective_cpus. The function will return 0 if all the CPUs listed in
1032  * cpus_allowed can be granted or an error code will be returned.
1033  *
1034  * For partcmd_disable, the cpuset is being transofrmed from a partition
1035  * root back to a non-partition root. any CPUs in cpus_allowed that are in
1036  * parent's subparts_cpus will be taken away from that cpumask and put back
1037  * into parent's effective_cpus. 0 should always be returned.
1038  *
1039  * For partcmd_update, if the optional newmask is specified, the cpu
1040  * list is to be changed from cpus_allowed to newmask. Otherwise,
1041  * cpus_allowed is assumed to remain the same. The cpuset should either
1042  * be a partition root or an invalid partition root. The partition root
1043  * state may change if newmask is NULL and none of the requested CPUs can
1044  * be granted by the parent. The function will return 1 if changes to
1045  * parent's subparts_cpus and effective_cpus happen or 0 otherwise.
1046  * Error code should only be returned when newmask is non-NULL.
1047  *
1048  * The partcmd_enable and partcmd_disable commands are used by
1049  * update_prstate(). The partcmd_update command is used by
1050  * update_cpumasks_hier() with newmask NULL and update_cpumask() with
1051  * newmask set.
1052  *
1053  * The checking is more strict when enabling partition root than the
1054  * other two commands.
1055  *
1056  * Because of the implicit cpu exclusive nature of a partition root,
1057  * cpumask changes that violates the cpu exclusivity rule will not be
1058  * permitted when checked by validate_change(). The validate_change()
1059  * function will also prevent any changes to the cpu list if it is not
1060  * a superset of children's cpu lists.
1061  */
1062 static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
1063                                           struct cpumask *newmask,
1064                                           struct tmpmasks *tmp)
1065 {
1066         struct cpuset *parent = parent_cs(cpuset);
1067         int adding;     /* Moving cpus from effective_cpus to subparts_cpus */
1068         int deleting;   /* Moving cpus from subparts_cpus to effective_cpus */
1069         bool part_error = false;        /* Partition error? */
1070
1071         lockdep_assert_held(&cpuset_mutex);
1072
1073         /*
1074          * The parent must be a partition root.
1075          * The new cpumask, if present, or the current cpus_allowed must
1076          * not be empty.
1077          */
1078         if (!is_partition_root(parent) ||
1079            (newmask && cpumask_empty(newmask)) ||
1080            (!newmask && cpumask_empty(cpuset->cpus_allowed)))
1081                 return -EINVAL;
1082
1083         /*
1084          * Enabling/disabling partition root is not allowed if there are
1085          * online children.
1086          */
1087         if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css))
1088                 return -EBUSY;
1089
1090         /*
1091          * Enabling partition root is not allowed if not all the CPUs
1092          * can be granted from parent's effective_cpus or at least one
1093          * CPU will be left after that.
1094          */
1095         if ((cmd == partcmd_enable) &&
1096            (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) ||
1097              cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus)))
1098                 return -EINVAL;
1099
1100         /*
1101          * A cpumask update cannot make parent's effective_cpus become empty.
1102          */
1103         adding = deleting = false;
1104         if (cmd == partcmd_enable) {
1105                 cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
1106                 adding = true;
1107         } else if (cmd == partcmd_disable) {
1108                 deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
1109                                        parent->subparts_cpus);
1110         } else if (newmask) {
1111                 /*
1112                  * partcmd_update with newmask:
1113                  *
1114                  * delmask = cpus_allowed & ~newmask & parent->subparts_cpus
1115                  * addmask = newmask & parent->effective_cpus
1116                  *                   & ~parent->subparts_cpus
1117                  */
1118                 cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
1119                 deleting = cpumask_and(tmp->delmask, tmp->delmask,
1120                                        parent->subparts_cpus);
1121
1122                 cpumask_and(tmp->addmask, newmask, parent->effective_cpus);
1123                 adding = cpumask_andnot(tmp->addmask, tmp->addmask,
1124                                         parent->subparts_cpus);
1125                 /*
1126                  * Return error if the new effective_cpus could become empty.
1127                  */
1128                 if (adding && !deleting &&
1129                     cpumask_equal(parent->effective_cpus, tmp->addmask))
1130                         return -EINVAL;
1131         } else {
1132                 /*
1133                  * partcmd_update w/o newmask:
1134                  *
1135                  * addmask = cpus_allowed & parent->effectiveb_cpus
1136                  *
1137                  * Note that parent's subparts_cpus may have been
1138                  * pre-shrunk in case there is a change in the cpu list.
1139                  * So no deletion is needed.
1140                  */
1141                 adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
1142                                      parent->effective_cpus);
1143                 part_error = cpumask_equal(tmp->addmask,
1144                                            parent->effective_cpus);
1145         }
1146
1147         if (cmd == partcmd_update) {
1148                 int prev_prs = cpuset->partition_root_state;
1149
1150                 /*
1151                  * Check for possible transition between PRS_ENABLED
1152                  * and PRS_ERROR.
1153                  */
1154                 switch (cpuset->partition_root_state) {
1155                 case PRS_ENABLED:
1156                         if (part_error)
1157                                 cpuset->partition_root_state = PRS_ERROR;
1158                         break;
1159                 case PRS_ERROR:
1160                         if (!part_error)
1161                                 cpuset->partition_root_state = PRS_ENABLED;
1162                         break;
1163                 }
1164                 /*
1165                  * Set part_error if previously in invalid state.
1166                  */
1167                 part_error = (prev_prs == PRS_ERROR);
1168         }
1169
1170         if (!part_error && (cpuset->partition_root_state == PRS_ERROR))
1171                 return 0;       /* Nothing need to be done */
1172
1173         if (cpuset->partition_root_state == PRS_ERROR) {
1174                 /*
1175                  * Remove all its cpus from parent's subparts_cpus.
1176                  */
1177                 adding = false;
1178                 deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
1179                                        parent->subparts_cpus);
1180         }
1181
1182         if (!adding && !deleting)
1183                 return 0;
1184
1185         /*
1186          * Change the parent's subparts_cpus.
1187          * Newly added CPUs will be removed from effective_cpus and
1188          * newly deleted ones will be added back to effective_cpus.
1189          */
1190         spin_lock_irq(&callback_lock);
1191         if (adding) {
1192                 cpumask_or(parent->subparts_cpus,
1193                            parent->subparts_cpus, tmp->addmask);
1194                 cpumask_andnot(parent->effective_cpus,
1195                                parent->effective_cpus, tmp->addmask);
1196         }
1197         if (deleting) {
1198                 cpumask_andnot(parent->subparts_cpus,
1199                                parent->subparts_cpus, tmp->delmask);
1200                 cpumask_or(parent->effective_cpus,
1201                            parent->effective_cpus, tmp->delmask);
1202         }
1203
1204         parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
1205         spin_unlock_irq(&callback_lock);
1206
1207         return cmd == partcmd_update;
1208 }
1209
1210 /*
1211  * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
1212  * @cs:  the cpuset to consider
1213  * @tmp: temp variables for calculating effective_cpus & partition setup
1214  *
1215  * When congifured cpumask is changed, the effective cpumasks of this cpuset
1216  * and all its descendants need to be updated.
1217  *
1218  * On legacy hierachy, effective_cpus will be the same with cpu_allowed.
1219  *
1220  * Called with cpuset_mutex held
1221  */
1222 static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
1223 {
1224         struct cpuset *cp;
1225         struct cgroup_subsys_state *pos_css;
1226         bool need_rebuild_sched_domains = false;
1227
1228         rcu_read_lock();
1229         cpuset_for_each_descendant_pre(cp, pos_css, cs) {
1230                 struct cpuset *parent = parent_cs(cp);
1231
1232                 compute_effective_cpumask(tmp->new_cpus, cp, parent);
1233
1234                 /*
1235                  * If it becomes empty, inherit the effective mask of the
1236                  * parent, which is guaranteed to have some CPUs.
1237                  */
1238                 if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
1239                         cpumask_copy(tmp->new_cpus, parent->effective_cpus);
1240                         if (!cp->use_parent_ecpus) {
1241                                 cp->use_parent_ecpus = true;
1242                                 parent->child_ecpus_count++;
1243                         }
1244                 } else if (cp->use_parent_ecpus) {
1245                         cp->use_parent_ecpus = false;
1246                         WARN_ON_ONCE(!parent->child_ecpus_count);
1247                         parent->child_ecpus_count--;
1248                 }
1249
1250                 /*
1251                  * Skip the whole subtree if the cpumask remains the same
1252                  * and has no partition root state.
1253                  */
1254                 if (!cp->partition_root_state &&
1255                     cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
1256                         pos_css = css_rightmost_descendant(pos_css);
1257                         continue;
1258                 }
1259
1260                 /*
1261                  * update_parent_subparts_cpumask() should have been called
1262                  * for cs already in update_cpumask(). We should also call
1263                  * update_tasks_cpumask() again for tasks in the parent
1264                  * cpuset if the parent's subparts_cpus changes.
1265                  */
1266                 if ((cp != cs) && cp->partition_root_state) {
1267                         switch (parent->partition_root_state) {
1268                         case PRS_DISABLED:
1269                                 /*
1270                                  * If parent is not a partition root or an
1271                                  * invalid partition root, clear the state
1272                                  * state and the CS_CPU_EXCLUSIVE flag.
1273                                  */
1274                                 WARN_ON_ONCE(cp->partition_root_state
1275                                              != PRS_ERROR);
1276                                 cp->partition_root_state = 0;
1277
1278                                 /*
1279                                  * clear_bit() is an atomic operation and
1280                                  * readers aren't interested in the state
1281                                  * of CS_CPU_EXCLUSIVE anyway. So we can
1282                                  * just update the flag without holding
1283                                  * the callback_lock.
1284                                  */
1285                                 clear_bit(CS_CPU_EXCLUSIVE, &cp->flags);
1286                                 break;
1287
1288                         case PRS_ENABLED:
1289                                 if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
1290                                         update_tasks_cpumask(parent);
1291                                 break;
1292
1293                         case PRS_ERROR:
1294                                 /*
1295                                  * When parent is invalid, it has to be too.
1296                                  */
1297                                 cp->partition_root_state = PRS_ERROR;
1298                                 if (cp->nr_subparts_cpus) {
1299                                         cp->nr_subparts_cpus = 0;
1300                                         cpumask_clear(cp->subparts_cpus);
1301                                 }
1302                                 break;
1303                         }
1304                 }
1305
1306                 if (!css_tryget_online(&cp->css))
1307                         continue;
1308                 rcu_read_unlock();
1309
1310                 spin_lock_irq(&callback_lock);
1311
1312                 cpumask_copy(cp->effective_cpus, tmp->new_cpus);
1313                 if (cp->nr_subparts_cpus &&
1314                    (cp->partition_root_state != PRS_ENABLED)) {
1315                         cp->nr_subparts_cpus = 0;
1316                         cpumask_clear(cp->subparts_cpus);
1317                 } else if (cp->nr_subparts_cpus) {
1318                         /*
1319                          * Make sure that effective_cpus & subparts_cpus
1320                          * are mutually exclusive.
1321                          *
1322                          * In the unlikely event that effective_cpus
1323                          * becomes empty. we clear cp->nr_subparts_cpus and
1324                          * let its child partition roots to compete for
1325                          * CPUs again.
1326                          */
1327                         cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
1328                                        cp->subparts_cpus);
1329                         if (cpumask_empty(cp->effective_cpus)) {
1330                                 cpumask_copy(cp->effective_cpus, tmp->new_cpus);
1331                                 cpumask_clear(cp->subparts_cpus);
1332                                 cp->nr_subparts_cpus = 0;
1333                         } else if (!cpumask_subset(cp->subparts_cpus,
1334                                                    tmp->new_cpus)) {
1335                                 cpumask_andnot(cp->subparts_cpus,
1336                                         cp->subparts_cpus, tmp->new_cpus);
1337                                 cp->nr_subparts_cpus
1338                                         = cpumask_weight(cp->subparts_cpus);
1339                         }
1340                 }
1341                 spin_unlock_irq(&callback_lock);
1342
1343                 WARN_ON(!is_in_v2_mode() &&
1344                         !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
1345
1346                 update_tasks_cpumask(cp);
1347
1348                 /*
1349                  * If the effective cpumask of any non-empty cpuset is changed,
1350                  * we need to rebuild sched domains.
1351                  */
1352                 if (!cpumask_empty(cp->cpus_allowed) &&
1353                     is_sched_load_balance(cp))
1354                         need_rebuild_sched_domains = true;
1355
1356                 rcu_read_lock();
1357                 css_put(&cp->css);
1358         }
1359         rcu_read_unlock();
1360
1361         if (need_rebuild_sched_domains)
1362                 rebuild_sched_domains_locked();
1363 }
1364
1365 /**
1366  * update_sibling_cpumasks - Update siblings cpumasks
1367  * @parent:  Parent cpuset
1368  * @cs:      Current cpuset
1369  * @tmp:     Temp variables
1370  */
1371 static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
1372                                     struct tmpmasks *tmp)
1373 {
1374         struct cpuset *sibling;
1375         struct cgroup_subsys_state *pos_css;
1376
1377         /*
1378          * Check all its siblings and call update_cpumasks_hier()
1379          * if their use_parent_ecpus flag is set in order for them
1380          * to use the right effective_cpus value.
1381          */
1382         rcu_read_lock();
1383         cpuset_for_each_child(sibling, pos_css, parent) {
1384                 if (sibling == cs)
1385                         continue;
1386                 if (!sibling->use_parent_ecpus)
1387                         continue;
1388
1389                 update_cpumasks_hier(sibling, tmp);
1390         }
1391         rcu_read_unlock();
1392 }
1393
1394 /**
1395  * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
1396  * @cs: the cpuset to consider
1397  * @trialcs: trial cpuset
1398  * @buf: buffer of cpu numbers written to this cpuset
1399  */
1400 static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
1401                           const char *buf)
1402 {
1403         int retval;
1404         struct tmpmasks tmp;
1405
1406         /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
1407         if (cs == &top_cpuset)
1408                 return -EACCES;
1409
1410         /*
1411          * An empty cpus_allowed is ok only if the cpuset has no tasks.
1412          * Since cpulist_parse() fails on an empty mask, we special case
1413          * that parsing.  The validate_change() call ensures that cpusets
1414          * with tasks have cpus.
1415          */
1416         if (!*buf) {
1417                 cpumask_clear(trialcs->cpus_allowed);
1418         } else {
1419                 retval = cpulist_parse(buf, trialcs->cpus_allowed);
1420                 if (retval < 0)
1421                         return retval;
1422
1423                 if (!cpumask_subset(trialcs->cpus_allowed,
1424                                     top_cpuset.cpus_allowed))
1425                         return -EINVAL;
1426         }
1427
1428         /* Nothing to do if the cpus didn't change */
1429         if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
1430                 return 0;
1431
1432         retval = validate_change(cs, trialcs);
1433         if (retval < 0)
1434                 return retval;
1435
1436 #ifdef CONFIG_CPUMASK_OFFSTACK
1437         /*
1438          * Use the cpumasks in trialcs for tmpmasks when they are pointers
1439          * to allocated cpumasks.
1440          */
1441         tmp.addmask  = trialcs->subparts_cpus;
1442         tmp.delmask  = trialcs->effective_cpus;
1443         tmp.new_cpus = trialcs->cpus_allowed;
1444 #endif
1445
1446         if (cs->partition_root_state) {
1447                 /* Cpumask of a partition root cannot be empty */
1448                 if (cpumask_empty(trialcs->cpus_allowed))
1449                         return -EINVAL;
1450                 if (update_parent_subparts_cpumask(cs, partcmd_update,
1451                                         trialcs->cpus_allowed, &tmp) < 0)
1452                         return -EINVAL;
1453         }
1454
1455         spin_lock_irq(&callback_lock);
1456         cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
1457
1458         /*
1459          * Make sure that subparts_cpus is a subset of cpus_allowed.
1460          */
1461         if (cs->nr_subparts_cpus) {
1462                 cpumask_andnot(cs->subparts_cpus, cs->subparts_cpus,
1463                                cs->cpus_allowed);
1464                 cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
1465         }
1466         spin_unlock_irq(&callback_lock);
1467
1468         update_cpumasks_hier(cs, &tmp);
1469
1470         if (cs->partition_root_state) {
1471                 struct cpuset *parent = parent_cs(cs);
1472
1473                 /*
1474                  * For partition root, update the cpumasks of sibling
1475                  * cpusets if they use parent's effective_cpus.
1476                  */
1477                 if (parent->child_ecpus_count)
1478                         update_sibling_cpumasks(parent, cs, &tmp);
1479         }
1480         return 0;
1481 }
1482
1483 /*
1484  * Migrate memory region from one set of nodes to another.  This is
1485  * performed asynchronously as it can be called from process migration path
1486  * holding locks involved in process management.  All mm migrations are
1487  * performed in the queued order and can be waited for by flushing
1488  * cpuset_migrate_mm_wq.
1489  */
1490
1491 struct cpuset_migrate_mm_work {
1492         struct work_struct      work;
1493         struct mm_struct        *mm;
1494         nodemask_t              from;
1495         nodemask_t              to;
1496 };
1497
1498 static void cpuset_migrate_mm_workfn(struct work_struct *work)
1499 {
1500         struct cpuset_migrate_mm_work *mwork =
1501                 container_of(work, struct cpuset_migrate_mm_work, work);
1502
1503         /* on a wq worker, no need to worry about %current's mems_allowed */
1504         do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
1505         mmput(mwork->mm);
1506         kfree(mwork);
1507 }
1508
1509 static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
1510                                                         const nodemask_t *to)
1511 {
1512         struct cpuset_migrate_mm_work *mwork;
1513
1514         mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
1515         if (mwork) {
1516                 mwork->mm = mm;
1517                 mwork->from = *from;
1518                 mwork->to = *to;
1519                 INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
1520                 queue_work(cpuset_migrate_mm_wq, &mwork->work);
1521         } else {
1522                 mmput(mm);
1523         }
1524 }
1525
1526 static void cpuset_post_attach(void)
1527 {
1528         flush_workqueue(cpuset_migrate_mm_wq);
1529 }
1530
1531 /*
1532  * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
1533  * @tsk: the task to change
1534  * @newmems: new nodes that the task will be set
1535  *
1536  * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed
1537  * and rebind an eventual tasks' mempolicy. If the task is allocating in
1538  * parallel, it might temporarily see an empty intersection, which results in
1539  * a seqlock check and retry before OOM or allocation failure.
1540  */
1541 static void cpuset_change_task_nodemask(struct task_struct *tsk,
1542                                         nodemask_t *newmems)
1543 {
1544         task_lock(tsk);
1545
1546         local_irq_disable();
1547         write_seqcount_begin(&tsk->mems_allowed_seq);
1548
1549         nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
1550         mpol_rebind_task(tsk, newmems);
1551         tsk->mems_allowed = *newmems;
1552
1553         write_seqcount_end(&tsk->mems_allowed_seq);
1554         local_irq_enable();
1555
1556         task_unlock(tsk);
1557 }
1558
1559 static void *cpuset_being_rebound;
1560
1561 /**
1562  * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
1563  * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
1564  *
1565  * Iterate through each task of @cs updating its mems_allowed to the
1566  * effective cpuset's.  As this function is called with cpuset_mutex held,
1567  * cpuset membership stays stable.
1568  */
1569 static void update_tasks_nodemask(struct cpuset *cs)
1570 {
1571         static nodemask_t newmems;      /* protected by cpuset_mutex */
1572         struct css_task_iter it;
1573         struct task_struct *task;
1574
1575         cpuset_being_rebound = cs;              /* causes mpol_dup() rebind */
1576
1577         guarantee_online_mems(cs, &newmems);
1578
1579         /*
1580          * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
1581          * take while holding tasklist_lock.  Forks can happen - the
1582          * mpol_dup() cpuset_being_rebound check will catch such forks,
1583          * and rebind their vma mempolicies too.  Because we still hold
1584          * the global cpuset_mutex, we know that no other rebind effort
1585          * will be contending for the global variable cpuset_being_rebound.
1586          * It's ok if we rebind the same mm twice; mpol_rebind_mm()
1587          * is idempotent.  Also migrate pages in each mm to new nodes.
1588          */
1589         css_task_iter_start(&cs->css, 0, &it);
1590         while ((task = css_task_iter_next(&it))) {
1591                 struct mm_struct *mm;
1592                 bool migrate;
1593
1594                 cpuset_change_task_nodemask(task, &newmems);
1595
1596                 mm = get_task_mm(task);
1597                 if (!mm)
1598                         continue;
1599
1600                 migrate = is_memory_migrate(cs);
1601
1602                 mpol_rebind_mm(mm, &cs->mems_allowed);
1603                 if (migrate)
1604                         cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
1605                 else
1606                         mmput(mm);
1607         }
1608         css_task_iter_end(&it);
1609
1610         /*
1611          * All the tasks' nodemasks have been updated, update
1612          * cs->old_mems_allowed.
1613          */
1614         cs->old_mems_allowed = newmems;
1615
1616         /* We're done rebinding vmas to this cpuset's new mems_allowed. */
1617         cpuset_being_rebound = NULL;
1618 }
1619
1620 /*
1621  * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
1622  * @cs: the cpuset to consider
1623  * @new_mems: a temp variable for calculating new effective_mems
1624  *
1625  * When configured nodemask is changed, the effective nodemasks of this cpuset
1626  * and all its descendants need to be updated.
1627  *
1628  * On legacy hiearchy, effective_mems will be the same with mems_allowed.
1629  *
1630  * Called with cpuset_mutex held
1631  */
1632 static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
1633 {
1634         struct cpuset *cp;
1635         struct cgroup_subsys_state *pos_css;
1636
1637         rcu_read_lock();
1638         cpuset_for_each_descendant_pre(cp, pos_css, cs) {
1639                 struct cpuset *parent = parent_cs(cp);
1640
1641                 nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
1642
1643                 /*
1644                  * If it becomes empty, inherit the effective mask of the
1645                  * parent, which is guaranteed to have some MEMs.
1646                  */
1647                 if (is_in_v2_mode() && nodes_empty(*new_mems))
1648                         *new_mems = parent->effective_mems;
1649
1650                 /* Skip the whole subtree if the nodemask remains the same. */
1651                 if (nodes_equal(*new_mems, cp->effective_mems)) {
1652                         pos_css = css_rightmost_descendant(pos_css);
1653                         continue;
1654                 }
1655
1656                 if (!css_tryget_online(&cp->css))
1657                         continue;
1658                 rcu_read_unlock();
1659
1660                 spin_lock_irq(&callback_lock);
1661                 cp->effective_mems = *new_mems;
1662                 spin_unlock_irq(&callback_lock);
1663
1664                 WARN_ON(!is_in_v2_mode() &&
1665                         !nodes_equal(cp->mems_allowed, cp->effective_mems));
1666
1667                 update_tasks_nodemask(cp);
1668
1669                 rcu_read_lock();
1670                 css_put(&cp->css);
1671         }
1672         rcu_read_unlock();
1673 }
1674
1675 /*
1676  * Handle user request to change the 'mems' memory placement
1677  * of a cpuset.  Needs to validate the request, update the
1678  * cpusets mems_allowed, and for each task in the cpuset,
1679  * update mems_allowed and rebind task's mempolicy and any vma
1680  * mempolicies and if the cpuset is marked 'memory_migrate',
1681  * migrate the tasks pages to the new memory.
1682  *
1683  * Call with cpuset_mutex held. May take callback_lock during call.
1684  * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
1685  * lock each such tasks mm->mmap_sem, scan its vma's and rebind
1686  * their mempolicies to the cpusets new mems_allowed.
1687  */
1688 static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1689                            const char *buf)
1690 {
1691         int retval;
1692
1693         /*
1694          * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
1695          * it's read-only
1696          */
1697         if (cs == &top_cpuset) {
1698                 retval = -EACCES;
1699                 goto done;
1700         }
1701
1702         /*
1703          * An empty mems_allowed is ok iff there are no tasks in the cpuset.
1704          * Since nodelist_parse() fails on an empty mask, we special case
1705          * that parsing.  The validate_change() call ensures that cpusets
1706          * with tasks have memory.
1707          */
1708         if (!*buf) {
1709                 nodes_clear(trialcs->mems_allowed);
1710         } else {
1711                 retval = nodelist_parse(buf, trialcs->mems_allowed);
1712                 if (retval < 0)
1713                         goto done;
1714
1715                 if (!nodes_subset(trialcs->mems_allowed,
1716                                   top_cpuset.mems_allowed)) {
1717                         retval = -EINVAL;
1718                         goto done;
1719                 }
1720         }
1721
1722         if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
1723                 retval = 0;             /* Too easy - nothing to do */
1724                 goto done;
1725         }
1726         retval = validate_change(cs, trialcs);
1727         if (retval < 0)
1728                 goto done;
1729
1730         spin_lock_irq(&callback_lock);
1731         cs->mems_allowed = trialcs->mems_allowed;
1732         spin_unlock_irq(&callback_lock);
1733
1734         /* use trialcs->mems_allowed as a temp variable */
1735         update_nodemasks_hier(cs, &trialcs->mems_allowed);
1736 done:
1737         return retval;
1738 }
1739
1740 bool current_cpuset_is_being_rebound(void)
1741 {
1742         bool ret;
1743
1744         rcu_read_lock();
1745         ret = task_cs(current) == cpuset_being_rebound;
1746         rcu_read_unlock();
1747
1748         return ret;
1749 }
1750
1751 static int update_relax_domain_level(struct cpuset *cs, s64 val)
1752 {
1753 #ifdef CONFIG_SMP
1754         if (val < -1 || val >= sched_domain_level_max)
1755                 return -EINVAL;
1756 #endif
1757
1758         if (val != cs->relax_domain_level) {
1759                 cs->relax_domain_level = val;
1760                 if (!cpumask_empty(cs->cpus_allowed) &&
1761                     is_sched_load_balance(cs))
1762                         rebuild_sched_domains_locked();
1763         }
1764
1765         return 0;
1766 }
1767
1768 /**
1769  * update_tasks_flags - update the spread flags of tasks in the cpuset.
1770  * @cs: the cpuset in which each task's spread flags needs to be changed
1771  *
1772  * Iterate through each task of @cs updating its spread flags.  As this
1773  * function is called with cpuset_mutex held, cpuset membership stays
1774  * stable.
1775  */
1776 static void update_tasks_flags(struct cpuset *cs)
1777 {
1778         struct css_task_iter it;
1779         struct task_struct *task;
1780
1781         css_task_iter_start(&cs->css, 0, &it);
1782         while ((task = css_task_iter_next(&it)))
1783                 cpuset_update_task_spread_flag(cs, task);
1784         css_task_iter_end(&it);
1785 }
1786
1787 /*
1788  * update_flag - read a 0 or a 1 in a file and update associated flag
1789  * bit:         the bit to update (see cpuset_flagbits_t)
1790  * cs:          the cpuset to update
1791  * turning_on:  whether the flag is being set or cleared
1792  *
1793  * Call with cpuset_mutex held.
1794  */
1795
1796 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1797                        int turning_on)
1798 {
1799         struct cpuset *trialcs;
1800         int balance_flag_changed;
1801         int spread_flag_changed;
1802         int err;
1803
1804         trialcs = alloc_trial_cpuset(cs);
1805         if (!trialcs)
1806                 return -ENOMEM;
1807
1808         if (turning_on)
1809                 set_bit(bit, &trialcs->flags);
1810         else
1811                 clear_bit(bit, &trialcs->flags);
1812
1813         err = validate_change(cs, trialcs);
1814         if (err < 0)
1815                 goto out;
1816
1817         balance_flag_changed = (is_sched_load_balance(cs) !=
1818                                 is_sched_load_balance(trialcs));
1819
1820         spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
1821                         || (is_spread_page(cs) != is_spread_page(trialcs)));
1822
1823         spin_lock_irq(&callback_lock);
1824         cs->flags = trialcs->flags;
1825         spin_unlock_irq(&callback_lock);
1826
1827         if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1828                 rebuild_sched_domains_locked();
1829
1830         if (spread_flag_changed)
1831                 update_tasks_flags(cs);
1832 out:
1833         free_cpuset(trialcs);
1834         return err;
1835 }
1836
1837 /*
1838  * update_prstate - update partititon_root_state
1839  * cs:  the cpuset to update
1840  * val: 0 - disabled, 1 - enabled
1841  *
1842  * Call with cpuset_mutex held.
1843  */
1844 static int update_prstate(struct cpuset *cs, int val)
1845 {
1846         int err;
1847         struct cpuset *parent = parent_cs(cs);
1848         struct tmpmasks tmp;
1849
1850         if ((val != 0) && (val != 1))
1851                 return -EINVAL;
1852         if (val == cs->partition_root_state)
1853                 return 0;
1854
1855         /*
1856          * Cannot force a partial or invalid partition root to a full
1857          * partition root.
1858          */
1859         if (val && cs->partition_root_state)
1860                 return -EINVAL;
1861
1862         if (alloc_cpumasks(NULL, &tmp))
1863                 return -ENOMEM;
1864
1865         err = -EINVAL;
1866         if (!cs->partition_root_state) {
1867                 /*
1868                  * Turning on partition root requires setting the
1869                  * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
1870                  * cannot be NULL.
1871                  */
1872                 if (cpumask_empty(cs->cpus_allowed))
1873                         goto out;
1874
1875                 err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
1876                 if (err)
1877                         goto out;
1878
1879                 err = update_parent_subparts_cpumask(cs, partcmd_enable,
1880                                                      NULL, &tmp);
1881                 if (err) {
1882                         update_flag(CS_CPU_EXCLUSIVE, cs, 0);
1883                         goto out;
1884                 }
1885                 cs->partition_root_state = PRS_ENABLED;
1886         } else {
1887                 /*
1888                  * Turning off partition root will clear the
1889                  * CS_CPU_EXCLUSIVE bit.
1890                  */
1891                 if (cs->partition_root_state == PRS_ERROR) {
1892                         cs->partition_root_state = 0;
1893                         update_flag(CS_CPU_EXCLUSIVE, cs, 0);
1894                         err = 0;
1895                         goto out;
1896                 }
1897
1898                 err = update_parent_subparts_cpumask(cs, partcmd_disable,
1899                                                      NULL, &tmp);
1900                 if (err)
1901                         goto out;
1902
1903                 cs->partition_root_state = 0;
1904
1905                 /* Turning off CS_CPU_EXCLUSIVE will not return error */
1906                 update_flag(CS_CPU_EXCLUSIVE, cs, 0);
1907         }
1908
1909         /*
1910          * Update cpumask of parent's tasks except when it is the top
1911          * cpuset as some system daemons cannot be mapped to other CPUs.
1912          */
1913         if (parent != &top_cpuset)
1914                 update_tasks_cpumask(parent);
1915
1916         if (parent->child_ecpus_count)
1917                 update_sibling_cpumasks(parent, cs, &tmp);
1918
1919         rebuild_sched_domains_locked();
1920 out:
1921         free_cpumasks(NULL, &tmp);
1922         return err;
1923 }
1924
1925 /*
1926  * Frequency meter - How fast is some event occurring?
1927  *
1928  * These routines manage a digitally filtered, constant time based,
1929  * event frequency meter.  There are four routines:
1930  *   fmeter_init() - initialize a frequency meter.
1931  *   fmeter_markevent() - called each time the event happens.
1932  *   fmeter_getrate() - returns the recent rate of such events.
1933  *   fmeter_update() - internal routine used to update fmeter.
1934  *
1935  * A common data structure is passed to each of these routines,
1936  * which is used to keep track of the state required to manage the
1937  * frequency meter and its digital filter.
1938  *
1939  * The filter works on the number of events marked per unit time.
1940  * The filter is single-pole low-pass recursive (IIR).  The time unit
1941  * is 1 second.  Arithmetic is done using 32-bit integers scaled to
1942  * simulate 3 decimal digits of precision (multiplied by 1000).
1943  *
1944  * With an FM_COEF of 933, and a time base of 1 second, the filter
1945  * has a half-life of 10 seconds, meaning that if the events quit
1946  * happening, then the rate returned from the fmeter_getrate()
1947  * will be cut in half each 10 seconds, until it converges to zero.
1948  *
1949  * It is not worth doing a real infinitely recursive filter.  If more
1950  * than FM_MAXTICKS ticks have elapsed since the last filter event,
1951  * just compute FM_MAXTICKS ticks worth, by which point the level
1952  * will be stable.
1953  *
1954  * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
1955  * arithmetic overflow in the fmeter_update() routine.
1956  *
1957  * Given the simple 32 bit integer arithmetic used, this meter works
1958  * best for reporting rates between one per millisecond (msec) and
1959  * one per 32 (approx) seconds.  At constant rates faster than one
1960  * per msec it maxes out at values just under 1,000,000.  At constant
1961  * rates between one per msec, and one per second it will stabilize
1962  * to a value N*1000, where N is the rate of events per second.
1963  * At constant rates between one per second and one per 32 seconds,
1964  * it will be choppy, moving up on the seconds that have an event,
1965  * and then decaying until the next event.  At rates slower than
1966  * about one in 32 seconds, it decays all the way back to zero between
1967  * each event.
1968  */
1969
1970 #define FM_COEF 933             /* coefficient for half-life of 10 secs */
1971 #define FM_MAXTICKS ((u32)99)   /* useless computing more ticks than this */
1972 #define FM_MAXCNT 1000000       /* limit cnt to avoid overflow */
1973 #define FM_SCALE 1000           /* faux fixed point scale */
1974
1975 /* Initialize a frequency meter */
1976 static void fmeter_init(struct fmeter *fmp)
1977 {
1978         fmp->cnt = 0;
1979         fmp->val = 0;
1980         fmp->time = 0;
1981         spin_lock_init(&fmp->lock);
1982 }
1983
1984 /* Internal meter update - process cnt events and update value */
1985 static void fmeter_update(struct fmeter *fmp)
1986 {
1987         time64_t now;
1988         u32 ticks;
1989
1990         now = ktime_get_seconds();
1991         ticks = now - fmp->time;
1992
1993         if (ticks == 0)
1994                 return;
1995
1996         ticks = min(FM_MAXTICKS, ticks);
1997         while (ticks-- > 0)
1998                 fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
1999         fmp->time = now;
2000
2001         fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
2002         fmp->cnt = 0;
2003 }
2004
2005 /* Process any previous ticks, then bump cnt by one (times scale). */
2006 static void fmeter_markevent(struct fmeter *fmp)
2007 {
2008         spin_lock(&fmp->lock);
2009         fmeter_update(fmp);
2010         fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
2011         spin_unlock(&fmp->lock);
2012 }
2013
2014 /* Process any previous ticks, then return current value. */
2015 static int fmeter_getrate(struct fmeter *fmp)
2016 {
2017         int val;
2018
2019         spin_lock(&fmp->lock);
2020         fmeter_update(fmp);
2021         val = fmp->val;
2022         spin_unlock(&fmp->lock);
2023         return val;
2024 }
2025
2026 static struct cpuset *cpuset_attach_old_cs;
2027
2028 /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
2029 static int cpuset_can_attach(struct cgroup_taskset *tset)
2030 {
2031         struct cgroup_subsys_state *css;
2032         struct cpuset *cs;
2033         struct task_struct *task;
2034         int ret;
2035
2036         /* used later by cpuset_attach() */
2037         cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
2038         cs = css_cs(css);
2039
2040         mutex_lock(&cpuset_mutex);
2041
2042         /* allow moving tasks into an empty cpuset if on default hierarchy */
2043         ret = -ENOSPC;
2044         if (!is_in_v2_mode() &&
2045             (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
2046                 goto out_unlock;
2047
2048         cgroup_taskset_for_each(task, css, tset) {
2049                 ret = task_can_attach(task, cs->cpus_allowed);
2050                 if (ret)
2051                         goto out_unlock;
2052                 ret = security_task_setscheduler(task);
2053                 if (ret)
2054                         goto out_unlock;
2055         }
2056
2057         /*
2058          * Mark attach is in progress.  This makes validate_change() fail
2059          * changes which zero cpus/mems_allowed.
2060          */
2061         cs->attach_in_progress++;
2062         ret = 0;
2063 out_unlock:
2064         mutex_unlock(&cpuset_mutex);
2065         return ret;
2066 }
2067
2068 static void cpuset_cancel_attach(struct cgroup_taskset *tset)
2069 {
2070         struct cgroup_subsys_state *css;
2071         struct cpuset *cs;
2072
2073         cgroup_taskset_first(tset, &css);
2074         cs = css_cs(css);
2075
2076         mutex_lock(&cpuset_mutex);
2077         css_cs(css)->attach_in_progress--;
2078         mutex_unlock(&cpuset_mutex);
2079 }
2080
2081 /*
2082  * Protected by cpuset_mutex.  cpus_attach is used only by cpuset_attach()
2083  * but we can't allocate it dynamically there.  Define it global and
2084  * allocate from cpuset_init().
2085  */
2086 static cpumask_var_t cpus_attach;
2087
2088 static void cpuset_attach(struct cgroup_taskset *tset)
2089 {
2090         /* static buf protected by cpuset_mutex */
2091         static nodemask_t cpuset_attach_nodemask_to;
2092         struct task_struct *task;
2093         struct task_struct *leader;
2094         struct cgroup_subsys_state *css;
2095         struct cpuset *cs;
2096         struct cpuset *oldcs = cpuset_attach_old_cs;
2097
2098         cgroup_taskset_first(tset, &css);
2099         cs = css_cs(css);
2100
2101         mutex_lock(&cpuset_mutex);
2102
2103         /* prepare for attach */
2104         if (cs == &top_cpuset)
2105                 cpumask_copy(cpus_attach, cpu_possible_mask);
2106         else
2107                 guarantee_online_cpus(cs, cpus_attach);
2108
2109         guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
2110
2111         cgroup_taskset_for_each(task, css, tset) {
2112                 /*
2113                  * can_attach beforehand should guarantee that this doesn't
2114                  * fail.  TODO: have a better way to handle failure here
2115                  */
2116                 WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
2117
2118                 cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
2119                 cpuset_update_task_spread_flag(cs, task);
2120         }
2121
2122         /*
2123          * Change mm for all threadgroup leaders. This is expensive and may
2124          * sleep and should be moved outside migration path proper.
2125          */
2126         cpuset_attach_nodemask_to = cs->effective_mems;
2127         cgroup_taskset_for_each_leader(leader, css, tset) {
2128                 struct mm_struct *mm = get_task_mm(leader);
2129
2130                 if (mm) {
2131                         mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
2132
2133                         /*
2134                          * old_mems_allowed is the same with mems_allowed
2135                          * here, except if this task is being moved
2136                          * automatically due to hotplug.  In that case
2137                          * @mems_allowed has been updated and is empty, so
2138                          * @old_mems_allowed is the right nodesets that we
2139                          * migrate mm from.
2140                          */
2141                         if (is_memory_migrate(cs))
2142                                 cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
2143                                                   &cpuset_attach_nodemask_to);
2144                         else
2145                                 mmput(mm);
2146                 }
2147         }
2148
2149         cs->old_mems_allowed = cpuset_attach_nodemask_to;
2150
2151         cs->attach_in_progress--;
2152         if (!cs->attach_in_progress)
2153                 wake_up(&cpuset_attach_wq);
2154
2155         mutex_unlock(&cpuset_mutex);
2156 }
2157
2158 /* The various types of files and directories in a cpuset file system */
2159
2160 typedef enum {
2161         FILE_MEMORY_MIGRATE,
2162         FILE_CPULIST,
2163         FILE_MEMLIST,
2164         FILE_EFFECTIVE_CPULIST,
2165         FILE_EFFECTIVE_MEMLIST,
2166         FILE_CPU_EXCLUSIVE,
2167         FILE_MEM_EXCLUSIVE,
2168         FILE_MEM_HARDWALL,
2169         FILE_SCHED_LOAD_BALANCE,
2170         FILE_PARTITION_ROOT,
2171         FILE_SCHED_RELAX_DOMAIN_LEVEL,
2172         FILE_MEMORY_PRESSURE_ENABLED,
2173         FILE_MEMORY_PRESSURE,
2174         FILE_SPREAD_PAGE,
2175         FILE_SPREAD_SLAB,
2176 } cpuset_filetype_t;
2177
2178 static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
2179                             u64 val)
2180 {
2181         struct cpuset *cs = css_cs(css);
2182         cpuset_filetype_t type = cft->private;
2183         int retval = 0;
2184
2185         mutex_lock(&cpuset_mutex);
2186         if (!is_cpuset_online(cs)) {
2187                 retval = -ENODEV;
2188                 goto out_unlock;
2189         }
2190
2191         switch (type) {
2192         case FILE_CPU_EXCLUSIVE:
2193                 retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
2194                 break;
2195         case FILE_MEM_EXCLUSIVE:
2196                 retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
2197                 break;
2198         case FILE_MEM_HARDWALL:
2199                 retval = update_flag(CS_MEM_HARDWALL, cs, val);
2200                 break;
2201         case FILE_SCHED_LOAD_BALANCE:
2202                 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
2203                 break;
2204         case FILE_MEMORY_MIGRATE:
2205                 retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
2206                 break;
2207         case FILE_MEMORY_PRESSURE_ENABLED:
2208                 cpuset_memory_pressure_enabled = !!val;
2209                 break;
2210         case FILE_SPREAD_PAGE:
2211                 retval = update_flag(CS_SPREAD_PAGE, cs, val);
2212                 break;
2213         case FILE_SPREAD_SLAB:
2214                 retval = update_flag(CS_SPREAD_SLAB, cs, val);
2215                 break;
2216         default:
2217                 retval = -EINVAL;
2218                 break;
2219         }
2220 out_unlock:
2221         mutex_unlock(&cpuset_mutex);
2222         return retval;
2223 }
2224
2225 static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
2226                             s64 val)
2227 {
2228         struct cpuset *cs = css_cs(css);
2229         cpuset_filetype_t type = cft->private;
2230         int retval = -ENODEV;
2231
2232         mutex_lock(&cpuset_mutex);
2233         if (!is_cpuset_online(cs))
2234                 goto out_unlock;
2235
2236         switch (type) {
2237         case FILE_SCHED_RELAX_DOMAIN_LEVEL:
2238                 retval = update_relax_domain_level(cs, val);
2239                 break;
2240         case FILE_PARTITION_ROOT:
2241                 retval = update_prstate(cs, val);
2242                 break;
2243         default:
2244                 retval = -EINVAL;
2245                 break;
2246         }
2247 out_unlock:
2248         mutex_unlock(&cpuset_mutex);
2249         return retval;
2250 }
2251
2252 /*
2253  * Common handling for a write to a "cpus" or "mems" file.
2254  */
2255 static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
2256                                     char *buf, size_t nbytes, loff_t off)
2257 {
2258         struct cpuset *cs = css_cs(of_css(of));
2259         struct cpuset *trialcs;
2260         int retval = -ENODEV;
2261
2262         buf = strstrip(buf);
2263
2264         /*
2265          * CPU or memory hotunplug may leave @cs w/o any execution
2266          * resources, in which case the hotplug code asynchronously updates
2267          * configuration and transfers all tasks to the nearest ancestor
2268          * which can execute.
2269          *
2270          * As writes to "cpus" or "mems" may restore @cs's execution
2271          * resources, wait for the previously scheduled operations before
2272          * proceeding, so that we don't end up keep removing tasks added
2273          * after execution capability is restored.
2274          *
2275          * cpuset_hotplug_work calls back into cgroup core via
2276          * cgroup_transfer_tasks() and waiting for it from a cgroupfs
2277          * operation like this one can lead to a deadlock through kernfs
2278          * active_ref protection.  Let's break the protection.  Losing the
2279          * protection is okay as we check whether @cs is online after
2280          * grabbing cpuset_mutex anyway.  This only happens on the legacy
2281          * hierarchies.
2282          */
2283         css_get(&cs->css);
2284         kernfs_break_active_protection(of->kn);
2285         flush_work(&cpuset_hotplug_work);
2286
2287         mutex_lock(&cpuset_mutex);
2288         if (!is_cpuset_online(cs))
2289                 goto out_unlock;
2290
2291         trialcs = alloc_trial_cpuset(cs);
2292         if (!trialcs) {
2293                 retval = -ENOMEM;
2294                 goto out_unlock;
2295         }
2296
2297         switch (of_cft(of)->private) {
2298         case FILE_CPULIST:
2299                 retval = update_cpumask(cs, trialcs, buf);
2300                 break;
2301         case FILE_MEMLIST:
2302                 retval = update_nodemask(cs, trialcs, buf);
2303                 break;
2304         default:
2305                 retval = -EINVAL;
2306                 break;
2307         }
2308
2309         free_cpuset(trialcs);
2310 out_unlock:
2311         mutex_unlock(&cpuset_mutex);
2312         kernfs_unbreak_active_protection(of->kn);
2313         css_put(&cs->css);
2314         flush_workqueue(cpuset_migrate_mm_wq);
2315         return retval ?: nbytes;
2316 }
2317
2318 /*
2319  * These ascii lists should be read in a single call, by using a user
2320  * buffer large enough to hold the entire map.  If read in smaller
2321  * chunks, there is no guarantee of atomicity.  Since the display format
2322  * used, list of ranges of sequential numbers, is variable length,
2323  * and since these maps can change value dynamically, one could read
2324  * gibberish by doing partial reads while a list was changing.
2325  */
2326 static int cpuset_common_seq_show(struct seq_file *sf, void *v)
2327 {
2328         struct cpuset *cs = css_cs(seq_css(sf));
2329         cpuset_filetype_t type = seq_cft(sf)->private;
2330         int ret = 0;
2331
2332         spin_lock_irq(&callback_lock);
2333
2334         switch (type) {
2335         case FILE_CPULIST:
2336                 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
2337                 break;
2338         case FILE_MEMLIST:
2339                 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
2340                 break;
2341         case FILE_EFFECTIVE_CPULIST:
2342                 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
2343                 break;
2344         case FILE_EFFECTIVE_MEMLIST:
2345                 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
2346                 break;
2347         default:
2348                 ret = -EINVAL;
2349         }
2350
2351         spin_unlock_irq(&callback_lock);
2352         return ret;
2353 }
2354
2355 static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
2356 {
2357         struct cpuset *cs = css_cs(css);
2358         cpuset_filetype_t type = cft->private;
2359         switch (type) {
2360         case FILE_CPU_EXCLUSIVE:
2361                 return is_cpu_exclusive(cs);
2362         case FILE_MEM_EXCLUSIVE:
2363                 return is_mem_exclusive(cs);
2364         case FILE_MEM_HARDWALL:
2365                 return is_mem_hardwall(cs);
2366         case FILE_SCHED_LOAD_BALANCE:
2367                 return is_sched_load_balance(cs);
2368         case FILE_MEMORY_MIGRATE:
2369                 return is_memory_migrate(cs);
2370         case FILE_MEMORY_PRESSURE_ENABLED:
2371                 return cpuset_memory_pressure_enabled;
2372         case FILE_MEMORY_PRESSURE:
2373                 return fmeter_getrate(&cs->fmeter);
2374         case FILE_SPREAD_PAGE:
2375                 return is_spread_page(cs);
2376         case FILE_SPREAD_SLAB:
2377                 return is_spread_slab(cs);
2378         default:
2379                 BUG();
2380         }
2381
2382         /* Unreachable but makes gcc happy */
2383         return 0;
2384 }
2385
2386 static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
2387 {
2388         struct cpuset *cs = css_cs(css);
2389         cpuset_filetype_t type = cft->private;
2390         switch (type) {
2391         case FILE_SCHED_RELAX_DOMAIN_LEVEL:
2392                 return cs->relax_domain_level;
2393         case FILE_PARTITION_ROOT:
2394                 return cs->partition_root_state;
2395         default:
2396                 BUG();
2397         }
2398
2399         /* Unrechable but makes gcc happy */
2400         return 0;
2401 }
2402
2403 /*
2404  * for the common functions, 'private' gives the type of file
2405  */
2406
2407 static struct cftype legacy_files[] = {
2408         {
2409                 .name = "cpus",
2410                 .seq_show = cpuset_common_seq_show,
2411                 .write = cpuset_write_resmask,
2412                 .max_write_len = (100U + 6 * NR_CPUS),
2413                 .private = FILE_CPULIST,
2414         },
2415
2416         {
2417                 .name = "mems",
2418                 .seq_show = cpuset_common_seq_show,
2419                 .write = cpuset_write_resmask,
2420                 .max_write_len = (100U + 6 * MAX_NUMNODES),
2421                 .private = FILE_MEMLIST,
2422         },
2423
2424         {
2425                 .name = "effective_cpus",
2426                 .seq_show = cpuset_common_seq_show,
2427                 .private = FILE_EFFECTIVE_CPULIST,
2428         },
2429
2430         {
2431                 .name = "effective_mems",
2432                 .seq_show = cpuset_common_seq_show,
2433                 .private = FILE_EFFECTIVE_MEMLIST,
2434         },
2435
2436         {
2437                 .name = "cpu_exclusive",
2438                 .read_u64 = cpuset_read_u64,
2439                 .write_u64 = cpuset_write_u64,
2440                 .private = FILE_CPU_EXCLUSIVE,
2441         },
2442
2443         {
2444                 .name = "mem_exclusive",
2445                 .read_u64 = cpuset_read_u64,
2446                 .write_u64 = cpuset_write_u64,
2447                 .private = FILE_MEM_EXCLUSIVE,
2448         },
2449
2450         {
2451                 .name = "mem_hardwall",
2452                 .read_u64 = cpuset_read_u64,
2453                 .write_u64 = cpuset_write_u64,
2454                 .private = FILE_MEM_HARDWALL,
2455         },
2456
2457         {
2458                 .name = "sched_load_balance",
2459                 .read_u64 = cpuset_read_u64,
2460                 .write_u64 = cpuset_write_u64,
2461                 .private = FILE_SCHED_LOAD_BALANCE,
2462         },
2463
2464         {
2465                 .name = "sched_relax_domain_level",
2466                 .read_s64 = cpuset_read_s64,
2467                 .write_s64 = cpuset_write_s64,
2468                 .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
2469         },
2470
2471         {
2472                 .name = "memory_migrate",
2473                 .read_u64 = cpuset_read_u64,
2474                 .write_u64 = cpuset_write_u64,
2475                 .private = FILE_MEMORY_MIGRATE,
2476         },
2477
2478         {
2479                 .name = "memory_pressure",
2480                 .read_u64 = cpuset_read_u64,
2481                 .private = FILE_MEMORY_PRESSURE,
2482         },
2483
2484         {
2485                 .name = "memory_spread_page",
2486                 .read_u64 = cpuset_read_u64,
2487                 .write_u64 = cpuset_write_u64,
2488                 .private = FILE_SPREAD_PAGE,
2489         },
2490
2491         {
2492                 .name = "memory_spread_slab",
2493                 .read_u64 = cpuset_read_u64,
2494                 .write_u64 = cpuset_write_u64,
2495                 .private = FILE_SPREAD_SLAB,
2496         },
2497
2498         {
2499                 .name = "memory_pressure_enabled",
2500                 .flags = CFTYPE_ONLY_ON_ROOT,
2501                 .read_u64 = cpuset_read_u64,
2502                 .write_u64 = cpuset_write_u64,
2503                 .private = FILE_MEMORY_PRESSURE_ENABLED,
2504         },
2505
2506         { }     /* terminate */
2507 };
2508
2509 /*
2510  * This is currently a minimal set for the default hierarchy. It can be
2511  * expanded later on by migrating more features and control files from v1.
2512  */
2513 static struct cftype dfl_files[] = {
2514         {
2515                 .name = "cpus",
2516                 .seq_show = cpuset_common_seq_show,
2517                 .write = cpuset_write_resmask,
2518                 .max_write_len = (100U + 6 * NR_CPUS),
2519                 .private = FILE_CPULIST,
2520                 .flags = CFTYPE_NOT_ON_ROOT,
2521         },
2522
2523         {
2524                 .name = "mems",
2525                 .seq_show = cpuset_common_seq_show,
2526                 .write = cpuset_write_resmask,
2527                 .max_write_len = (100U + 6 * MAX_NUMNODES),
2528                 .private = FILE_MEMLIST,
2529                 .flags = CFTYPE_NOT_ON_ROOT,
2530         },
2531
2532         {
2533                 .name = "cpus.effective",
2534                 .seq_show = cpuset_common_seq_show,
2535                 .private = FILE_EFFECTIVE_CPULIST,
2536                 .flags = CFTYPE_NOT_ON_ROOT,
2537         },
2538
2539         {
2540                 .name = "mems.effective",
2541                 .seq_show = cpuset_common_seq_show,
2542                 .private = FILE_EFFECTIVE_MEMLIST,
2543                 .flags = CFTYPE_NOT_ON_ROOT,
2544         },
2545
2546         {
2547                 .name = "sched.partition",
2548                 .read_s64 = cpuset_read_s64,
2549                 .write_s64 = cpuset_write_s64,
2550                 .private = FILE_PARTITION_ROOT,
2551                 .flags = CFTYPE_NOT_ON_ROOT,
2552         },
2553
2554         { }     /* terminate */
2555 };
2556
2557
2558 /*
2559  *      cpuset_css_alloc - allocate a cpuset css
2560  *      cgrp:   control group that the new cpuset will be part of
2561  */
2562
2563 static struct cgroup_subsys_state *
2564 cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
2565 {
2566         struct cpuset *cs;
2567
2568         if (!parent_css)
2569                 return &top_cpuset.css;
2570
2571         cs = kzalloc(sizeof(*cs), GFP_KERNEL);
2572         if (!cs)
2573                 return ERR_PTR(-ENOMEM);
2574
2575         if (alloc_cpumasks(cs, NULL)) {
2576                 kfree(cs);
2577                 return ERR_PTR(-ENOMEM);
2578         }
2579
2580         set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
2581         nodes_clear(cs->mems_allowed);
2582         nodes_clear(cs->effective_mems);
2583         fmeter_init(&cs->fmeter);
2584         cs->relax_domain_level = -1;
2585
2586         return &cs->css;
2587 }
2588
2589 static int cpuset_css_online(struct cgroup_subsys_state *css)
2590 {
2591         struct cpuset *cs = css_cs(css);
2592         struct cpuset *parent = parent_cs(cs);
2593         struct cpuset *tmp_cs;
2594         struct cgroup_subsys_state *pos_css;
2595
2596         if (!parent)
2597                 return 0;
2598
2599         mutex_lock(&cpuset_mutex);
2600
2601         set_bit(CS_ONLINE, &cs->flags);
2602         if (is_spread_page(parent))
2603                 set_bit(CS_SPREAD_PAGE, &cs->flags);
2604         if (is_spread_slab(parent))
2605                 set_bit(CS_SPREAD_SLAB, &cs->flags);
2606
2607         cpuset_inc();
2608
2609         spin_lock_irq(&callback_lock);
2610         if (is_in_v2_mode()) {
2611                 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
2612                 cs->effective_mems = parent->effective_mems;
2613                 cs->use_parent_ecpus = true;
2614                 parent->child_ecpus_count++;
2615         }
2616         spin_unlock_irq(&callback_lock);
2617
2618         if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
2619                 goto out_unlock;
2620
2621         /*
2622          * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
2623          * set.  This flag handling is implemented in cgroup core for
2624          * histrical reasons - the flag may be specified during mount.
2625          *
2626          * Currently, if any sibling cpusets have exclusive cpus or mem, we
2627          * refuse to clone the configuration - thereby refusing the task to
2628          * be entered, and as a result refusing the sys_unshare() or
2629          * clone() which initiated it.  If this becomes a problem for some
2630          * users who wish to allow that scenario, then this could be
2631          * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
2632          * (and likewise for mems) to the new cgroup.
2633          */
2634         rcu_read_lock();
2635         cpuset_for_each_child(tmp_cs, pos_css, parent) {
2636                 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
2637                         rcu_read_unlock();
2638                         goto out_unlock;
2639                 }
2640         }
2641         rcu_read_unlock();
2642
2643         spin_lock_irq(&callback_lock);
2644         cs->mems_allowed = parent->mems_allowed;
2645         cs->effective_mems = parent->mems_allowed;
2646         cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
2647         cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
2648         spin_unlock_irq(&callback_lock);
2649 out_unlock:
2650         mutex_unlock(&cpuset_mutex);
2651         return 0;
2652 }
2653
2654 /*
2655  * If the cpuset being removed has its flag 'sched_load_balance'
2656  * enabled, then simulate turning sched_load_balance off, which
2657  * will call rebuild_sched_domains_locked(). That is not needed
2658  * in the default hierarchy where only changes in partition
2659  * will cause repartitioning.
2660  *
2661  * If the cpuset has the 'sched.partition' flag enabled, simulate
2662  * turning 'sched.partition" off.
2663  */
2664
2665 static void cpuset_css_offline(struct cgroup_subsys_state *css)
2666 {
2667         struct cpuset *cs = css_cs(css);
2668
2669         mutex_lock(&cpuset_mutex);
2670
2671         if (is_partition_root(cs))
2672                 update_prstate(cs, 0);
2673
2674         if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
2675             is_sched_load_balance(cs))
2676                 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
2677
2678         if (cs->use_parent_ecpus) {
2679                 struct cpuset *parent = parent_cs(cs);
2680
2681                 cs->use_parent_ecpus = false;
2682                 parent->child_ecpus_count--;
2683         }
2684
2685         cpuset_dec();
2686         clear_bit(CS_ONLINE, &cs->flags);
2687
2688         mutex_unlock(&cpuset_mutex);
2689 }
2690
2691 static void cpuset_css_free(struct cgroup_subsys_state *css)
2692 {
2693         struct cpuset *cs = css_cs(css);
2694
2695         free_cpuset(cs);
2696 }
2697
2698 static void cpuset_bind(struct cgroup_subsys_state *root_css)
2699 {
2700         mutex_lock(&cpuset_mutex);
2701         spin_lock_irq(&callback_lock);
2702
2703         if (is_in_v2_mode()) {
2704                 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
2705                 top_cpuset.mems_allowed = node_possible_map;
2706         } else {
2707                 cpumask_copy(top_cpuset.cpus_allowed,
2708                              top_cpuset.effective_cpus);
2709                 top_cpuset.mems_allowed = top_cpuset.effective_mems;
2710         }
2711
2712         spin_unlock_irq(&callback_lock);
2713         mutex_unlock(&cpuset_mutex);
2714 }
2715
2716 /*
2717  * Make sure the new task conform to the current state of its parent,
2718  * which could have been changed by cpuset just after it inherits the
2719  * state from the parent and before it sits on the cgroup's task list.
2720  */
2721 static void cpuset_fork(struct task_struct *task)
2722 {
2723         if (task_css_is_root(task, cpuset_cgrp_id))
2724                 return;
2725
2726         set_cpus_allowed_ptr(task, &current->cpus_allowed);
2727         task->mems_allowed = current->mems_allowed;
2728 }
2729
2730 struct cgroup_subsys cpuset_cgrp_subsys = {
2731         .css_alloc      = cpuset_css_alloc,
2732         .css_online     = cpuset_css_online,
2733         .css_offline    = cpuset_css_offline,
2734         .css_free       = cpuset_css_free,
2735         .can_attach     = cpuset_can_attach,
2736         .cancel_attach  = cpuset_cancel_attach,
2737         .attach         = cpuset_attach,
2738         .post_attach    = cpuset_post_attach,
2739         .bind           = cpuset_bind,
2740         .fork           = cpuset_fork,
2741         .legacy_cftypes = legacy_files,
2742         .dfl_cftypes    = dfl_files,
2743         .early_init     = true,
2744         .threaded       = true,
2745 };
2746
2747 /**
2748  * cpuset_init - initialize cpusets at system boot
2749  *
2750  * Description: Initialize top_cpuset and the cpuset internal file system,
2751  **/
2752
2753 int __init cpuset_init(void)
2754 {
2755         int err = 0;
2756
2757         BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
2758         BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
2759         BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
2760
2761         cpumask_setall(top_cpuset.cpus_allowed);
2762         nodes_setall(top_cpuset.mems_allowed);
2763         cpumask_setall(top_cpuset.effective_cpus);
2764         nodes_setall(top_cpuset.effective_mems);
2765
2766         fmeter_init(&top_cpuset.fmeter);
2767         set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
2768         top_cpuset.relax_domain_level = -1;
2769
2770         err = register_filesystem(&cpuset_fs_type);
2771         if (err < 0)
2772                 return err;
2773
2774         BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
2775
2776         return 0;
2777 }
2778
2779 /*
2780  * If CPU and/or memory hotplug handlers, below, unplug any CPUs
2781  * or memory nodes, we need to walk over the cpuset hierarchy,
2782  * removing that CPU or node from all cpusets.  If this removes the
2783  * last CPU or node from a cpuset, then move the tasks in the empty
2784  * cpuset to its next-highest non-empty parent.
2785  */
2786 static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2787 {
2788         struct cpuset *parent;
2789
2790         /*
2791          * Find its next-highest non-empty parent, (top cpuset
2792          * has online cpus, so can't be empty).
2793          */
2794         parent = parent_cs(cs);
2795         while (cpumask_empty(parent->cpus_allowed) ||
2796                         nodes_empty(parent->mems_allowed))
2797                 parent = parent_cs(parent);
2798
2799         if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
2800                 pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
2801                 pr_cont_cgroup_name(cs->css.cgroup);
2802                 pr_cont("\n");
2803         }
2804 }
2805
2806 static void
2807 hotplug_update_tasks_legacy(struct cpuset *cs,
2808                             struct cpumask *new_cpus, nodemask_t *new_mems,
2809                             bool cpus_updated, bool mems_updated)
2810 {
2811         bool is_empty;
2812
2813         spin_lock_irq(&callback_lock);
2814         cpumask_copy(cs->cpus_allowed, new_cpus);
2815         cpumask_copy(cs->effective_cpus, new_cpus);
2816         cs->mems_allowed = *new_mems;
2817         cs->effective_mems = *new_mems;
2818         spin_unlock_irq(&callback_lock);
2819
2820         /*
2821          * Don't call update_tasks_cpumask() if the cpuset becomes empty,
2822          * as the tasks will be migratecd to an ancestor.
2823          */
2824         if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
2825                 update_tasks_cpumask(cs);
2826         if (mems_updated && !nodes_empty(cs->mems_allowed))
2827                 update_tasks_nodemask(cs);
2828
2829         is_empty = cpumask_empty(cs->cpus_allowed) ||
2830                    nodes_empty(cs->mems_allowed);
2831
2832         mutex_unlock(&cpuset_mutex);
2833
2834         /*
2835          * Move tasks to the nearest ancestor with execution resources,
2836          * This is full cgroup operation which will also call back into
2837          * cpuset. Should be done outside any lock.
2838          */
2839         if (is_empty)
2840                 remove_tasks_in_empty_cpuset(cs);
2841
2842         mutex_lock(&cpuset_mutex);
2843 }
2844
2845 static void
2846 hotplug_update_tasks(struct cpuset *cs,
2847                      struct cpumask *new_cpus, nodemask_t *new_mems,
2848                      bool cpus_updated, bool mems_updated)
2849 {
2850         if (cpumask_empty(new_cpus))
2851                 cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
2852         if (nodes_empty(*new_mems))
2853                 *new_mems = parent_cs(cs)->effective_mems;
2854
2855         spin_lock_irq(&callback_lock);
2856         cpumask_copy(cs->effective_cpus, new_cpus);
2857         cs->effective_mems = *new_mems;
2858         spin_unlock_irq(&callback_lock);
2859
2860         if (cpus_updated)
2861                 update_tasks_cpumask(cs);
2862         if (mems_updated)
2863                 update_tasks_nodemask(cs);
2864 }
2865
2866 /**
2867  * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
2868  * @cs: cpuset in interest
2869  *
2870  * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
2871  * offline, update @cs accordingly.  If @cs ends up with no CPU or memory,
2872  * all its tasks are moved to the nearest ancestor with both resources.
2873  */
2874 static void cpuset_hotplug_update_tasks(struct cpuset *cs)
2875 {
2876         static cpumask_t new_cpus;
2877         static nodemask_t new_mems;
2878         bool cpus_updated;
2879         bool mems_updated;
2880 retry:
2881         wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
2882
2883         mutex_lock(&cpuset_mutex);
2884
2885         /*
2886          * We have raced with task attaching. We wait until attaching
2887          * is finished, so we won't attach a task to an empty cpuset.
2888          */
2889         if (cs->attach_in_progress) {
2890                 mutex_unlock(&cpuset_mutex);
2891                 goto retry;
2892         }
2893
2894         cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
2895         nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
2896
2897         cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
2898         mems_updated = !nodes_equal(new_mems, cs->effective_mems);
2899
2900         if (is_in_v2_mode())
2901                 hotplug_update_tasks(cs, &new_cpus, &new_mems,
2902                                      cpus_updated, mems_updated);
2903         else
2904                 hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
2905                                             cpus_updated, mems_updated);
2906
2907         mutex_unlock(&cpuset_mutex);
2908 }
2909
2910 static bool force_rebuild;
2911
2912 void cpuset_force_rebuild(void)
2913 {
2914         force_rebuild = true;
2915 }
2916
2917 /**
2918  * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
2919  *
2920  * This function is called after either CPU or memory configuration has
2921  * changed and updates cpuset accordingly.  The top_cpuset is always
2922  * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
2923  * order to make cpusets transparent (of no affect) on systems that are
2924  * actively using CPU hotplug but making no active use of cpusets.
2925  *
2926  * Non-root cpusets are only affected by offlining.  If any CPUs or memory
2927  * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
2928  * all descendants.
2929  *
2930  * Note that CPU offlining during suspend is ignored.  We don't modify
2931  * cpusets across suspend/resume cycles at all.
2932  */
2933 static void cpuset_hotplug_workfn(struct work_struct *work)
2934 {
2935         static cpumask_t new_cpus;
2936         static nodemask_t new_mems;
2937         bool cpus_updated, mems_updated;
2938         bool on_dfl = is_in_v2_mode();
2939
2940         mutex_lock(&cpuset_mutex);
2941
2942         /* fetch the available cpus/mems and find out which changed how */
2943         cpumask_copy(&new_cpus, cpu_active_mask);
2944         new_mems = node_states[N_MEMORY];
2945
2946         cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
2947         mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
2948
2949         /* synchronize cpus_allowed to cpu_active_mask */
2950         if (cpus_updated) {
2951                 spin_lock_irq(&callback_lock);
2952                 if (!on_dfl)
2953                         cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
2954                 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
2955                 spin_unlock_irq(&callback_lock);
2956                 /* we don't mess with cpumasks of tasks in top_cpuset */
2957         }
2958
2959         /* synchronize mems_allowed to N_MEMORY */
2960         if (mems_updated) {
2961                 spin_lock_irq(&callback_lock);
2962                 if (!on_dfl)
2963                         top_cpuset.mems_allowed = new_mems;
2964                 top_cpuset.effective_mems = new_mems;
2965                 spin_unlock_irq(&callback_lock);
2966                 update_tasks_nodemask(&top_cpuset);
2967         }
2968
2969         mutex_unlock(&cpuset_mutex);
2970
2971         /* if cpus or mems changed, we need to propagate to descendants */
2972         if (cpus_updated || mems_updated) {
2973                 struct cpuset *cs;
2974                 struct cgroup_subsys_state *pos_css;
2975
2976                 rcu_read_lock();
2977                 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
2978                         if (cs == &top_cpuset || !css_tryget_online(&cs->css))
2979                                 continue;
2980                         rcu_read_unlock();
2981
2982                         cpuset_hotplug_update_tasks(cs);
2983
2984                         rcu_read_lock();
2985                         css_put(&cs->css);
2986                 }
2987                 rcu_read_unlock();
2988         }
2989
2990         /* rebuild sched domains if cpus_allowed has changed */
2991         if (cpus_updated || force_rebuild) {
2992                 force_rebuild = false;
2993                 rebuild_sched_domains();
2994         }
2995 }
2996
2997 void cpuset_update_active_cpus(void)
2998 {
2999         /*
3000          * We're inside cpu hotplug critical region which usually nests
3001          * inside cgroup synchronization.  Bounce actual hotplug processing
3002          * to a work item to avoid reverse locking order.
3003          */
3004         schedule_work(&cpuset_hotplug_work);
3005 }
3006
3007 void cpuset_wait_for_hotplug(void)
3008 {
3009         flush_work(&cpuset_hotplug_work);
3010 }
3011
3012 /*
3013  * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
3014  * Call this routine anytime after node_states[N_MEMORY] changes.
3015  * See cpuset_update_active_cpus() for CPU hotplug handling.
3016  */
3017 static int cpuset_track_online_nodes(struct notifier_block *self,
3018                                 unsigned long action, void *arg)
3019 {
3020         schedule_work(&cpuset_hotplug_work);
3021         return NOTIFY_OK;
3022 }
3023
3024 static struct notifier_block cpuset_track_online_nodes_nb = {
3025         .notifier_call = cpuset_track_online_nodes,
3026         .priority = 10,         /* ??! */
3027 };
3028
3029 /**
3030  * cpuset_init_smp - initialize cpus_allowed
3031  *
3032  * Description: Finish top cpuset after cpu, node maps are initialized
3033  */
3034 void __init cpuset_init_smp(void)
3035 {
3036         cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
3037         top_cpuset.mems_allowed = node_states[N_MEMORY];
3038         top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
3039
3040         cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
3041         top_cpuset.effective_mems = node_states[N_MEMORY];
3042
3043         register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
3044
3045         cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
3046         BUG_ON(!cpuset_migrate_mm_wq);
3047 }
3048
3049 /**
3050  * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
3051  * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
3052  * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
3053  *
3054  * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
3055  * attached to the specified @tsk.  Guaranteed to return some non-empty
3056  * subset of cpu_online_mask, even if this means going outside the
3057  * tasks cpuset.
3058  **/
3059
3060 void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
3061 {
3062         unsigned long flags;
3063
3064         spin_lock_irqsave(&callback_lock, flags);
3065         rcu_read_lock();
3066         guarantee_online_cpus(task_cs(tsk), pmask);
3067         rcu_read_unlock();
3068         spin_unlock_irqrestore(&callback_lock, flags);
3069 }
3070
3071 void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
3072 {
3073         rcu_read_lock();
3074         do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
3075         rcu_read_unlock();
3076
3077         /*
3078          * We own tsk->cpus_allowed, nobody can change it under us.
3079          *
3080          * But we used cs && cs->cpus_allowed lockless and thus can
3081          * race with cgroup_attach_task() or update_cpumask() and get
3082          * the wrong tsk->cpus_allowed. However, both cases imply the
3083          * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
3084          * which takes task_rq_lock().
3085          *
3086          * If we are called after it dropped the lock we must see all
3087          * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
3088          * set any mask even if it is not right from task_cs() pov,
3089          * the pending set_cpus_allowed_ptr() will fix things.
3090          *
3091          * select_fallback_rq() will fix things ups and set cpu_possible_mask
3092          * if required.
3093          */
3094 }
3095
3096 void __init cpuset_init_current_mems_allowed(void)
3097 {
3098         nodes_setall(current->mems_allowed);
3099 }
3100
3101 /**
3102  * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
3103  * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
3104  *
3105  * Description: Returns the nodemask_t mems_allowed of the cpuset
3106  * attached to the specified @tsk.  Guaranteed to return some non-empty
3107  * subset of node_states[N_MEMORY], even if this means going outside the
3108  * tasks cpuset.
3109  **/
3110
3111 nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
3112 {
3113         nodemask_t mask;
3114         unsigned long flags;
3115
3116         spin_lock_irqsave(&callback_lock, flags);
3117         rcu_read_lock();
3118         guarantee_online_mems(task_cs(tsk), &mask);
3119         rcu_read_unlock();
3120         spin_unlock_irqrestore(&callback_lock, flags);
3121
3122         return mask;
3123 }
3124
3125 /**
3126  * cpuset_nodemask_valid_mems_allowed - check nodemask vs. curremt mems_allowed
3127  * @nodemask: the nodemask to be checked
3128  *
3129  * Are any of the nodes in the nodemask allowed in current->mems_allowed?
3130  */
3131 int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
3132 {
3133         return nodes_intersects(*nodemask, current->mems_allowed);
3134 }
3135
3136 /*
3137  * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
3138  * mem_hardwall ancestor to the specified cpuset.  Call holding
3139  * callback_lock.  If no ancestor is mem_exclusive or mem_hardwall
3140  * (an unusual configuration), then returns the root cpuset.
3141  */
3142 static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
3143 {
3144         while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
3145                 cs = parent_cs(cs);
3146         return cs;
3147 }
3148
3149 /**
3150  * cpuset_node_allowed - Can we allocate on a memory node?
3151  * @node: is this an allowed node?
3152  * @gfp_mask: memory allocation flags
3153  *
3154  * If we're in interrupt, yes, we can always allocate.  If @node is set in
3155  * current's mems_allowed, yes.  If it's not a __GFP_HARDWALL request and this
3156  * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
3157  * yes.  If current has access to memory reserves as an oom victim, yes.
3158  * Otherwise, no.
3159  *
3160  * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
3161  * and do not allow allocations outside the current tasks cpuset
3162  * unless the task has been OOM killed.
3163  * GFP_KERNEL allocations are not so marked, so can escape to the
3164  * nearest enclosing hardwalled ancestor cpuset.
3165  *
3166  * Scanning up parent cpusets requires callback_lock.  The
3167  * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
3168  * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
3169  * current tasks mems_allowed came up empty on the first pass over
3170  * the zonelist.  So only GFP_KERNEL allocations, if all nodes in the
3171  * cpuset are short of memory, might require taking the callback_lock.
3172  *
3173  * The first call here from mm/page_alloc:get_page_from_freelist()
3174  * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
3175  * so no allocation on a node outside the cpuset is allowed (unless
3176  * in interrupt, of course).
3177  *
3178  * The second pass through get_page_from_freelist() doesn't even call
3179  * here for GFP_ATOMIC calls.  For those calls, the __alloc_pages()
3180  * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
3181  * in alloc_flags.  That logic and the checks below have the combined
3182  * affect that:
3183  *      in_interrupt - any node ok (current task context irrelevant)
3184  *      GFP_ATOMIC   - any node ok
3185  *      tsk_is_oom_victim   - any node ok
3186  *      GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
3187  *      GFP_USER     - only nodes in current tasks mems allowed ok.
3188  */
3189 bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
3190 {
3191         struct cpuset *cs;              /* current cpuset ancestors */
3192         int allowed;                    /* is allocation in zone z allowed? */
3193         unsigned long flags;
3194
3195         if (in_interrupt())
3196                 return true;
3197         if (node_isset(node, current->mems_allowed))
3198                 return true;
3199         /*
3200          * Allow tasks that have access to memory reserves because they have
3201          * been OOM killed to get memory anywhere.
3202          */
3203         if (unlikely(tsk_is_oom_victim(current)))
3204                 return true;
3205         if (gfp_mask & __GFP_HARDWALL)  /* If hardwall request, stop here */
3206                 return false;
3207
3208         if (current->flags & PF_EXITING) /* Let dying task have memory */
3209                 return true;
3210
3211         /* Not hardwall and node outside mems_allowed: scan up cpusets */
3212         spin_lock_irqsave(&callback_lock, flags);
3213
3214         rcu_read_lock();
3215         cs = nearest_hardwall_ancestor(task_cs(current));
3216         allowed = node_isset(node, cs->mems_allowed);
3217         rcu_read_unlock();
3218
3219         spin_unlock_irqrestore(&callback_lock, flags);
3220         return allowed;
3221 }
3222
3223 /**
3224  * cpuset_mem_spread_node() - On which node to begin search for a file page
3225  * cpuset_slab_spread_node() - On which node to begin search for a slab page
3226  *
3227  * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
3228  * tasks in a cpuset with is_spread_page or is_spread_slab set),
3229  * and if the memory allocation used cpuset_mem_spread_node()
3230  * to determine on which node to start looking, as it will for
3231  * certain page cache or slab cache pages such as used for file
3232  * system buffers and inode caches, then instead of starting on the
3233  * local node to look for a free page, rather spread the starting
3234  * node around the tasks mems_allowed nodes.
3235  *
3236  * We don't have to worry about the returned node being offline
3237  * because "it can't happen", and even if it did, it would be ok.
3238  *
3239  * The routines calling guarantee_online_mems() are careful to
3240  * only set nodes in task->mems_allowed that are online.  So it
3241  * should not be possible for the following code to return an
3242  * offline node.  But if it did, that would be ok, as this routine
3243  * is not returning the node where the allocation must be, only
3244  * the node where the search should start.  The zonelist passed to
3245  * __alloc_pages() will include all nodes.  If the slab allocator
3246  * is passed an offline node, it will fall back to the local node.
3247  * See kmem_cache_alloc_node().
3248  */
3249
3250 static int cpuset_spread_node(int *rotor)
3251 {
3252         return *rotor = next_node_in(*rotor, current->mems_allowed);
3253 }
3254
3255 int cpuset_mem_spread_node(void)
3256 {
3257         if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
3258                 current->cpuset_mem_spread_rotor =
3259                         node_random(&current->mems_allowed);
3260
3261         return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
3262 }
3263
3264 int cpuset_slab_spread_node(void)
3265 {
3266         if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
3267                 current->cpuset_slab_spread_rotor =
3268                         node_random(&current->mems_allowed);
3269
3270         return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
3271 }
3272
3273 EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
3274
3275 /**
3276  * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
3277  * @tsk1: pointer to task_struct of some task.
3278  * @tsk2: pointer to task_struct of some other task.
3279  *
3280  * Description: Return true if @tsk1's mems_allowed intersects the
3281  * mems_allowed of @tsk2.  Used by the OOM killer to determine if
3282  * one of the task's memory usage might impact the memory available
3283  * to the other.
3284  **/
3285
3286 int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
3287                                    const struct task_struct *tsk2)
3288 {
3289         return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
3290 }
3291
3292 /**
3293  * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
3294  *
3295  * Description: Prints current's name, cpuset name, and cached copy of its
3296  * mems_allowed to the kernel log.
3297  */
3298 void cpuset_print_current_mems_allowed(void)
3299 {
3300         struct cgroup *cgrp;
3301
3302         rcu_read_lock();
3303
3304         cgrp = task_cs(current)->css.cgroup;
3305         pr_info("%s cpuset=", current->comm);
3306         pr_cont_cgroup_name(cgrp);
3307         pr_cont(" mems_allowed=%*pbl\n",
3308                 nodemask_pr_args(&current->mems_allowed));
3309
3310         rcu_read_unlock();
3311 }
3312
3313 /*
3314  * Collection of memory_pressure is suppressed unless
3315  * this flag is enabled by writing "1" to the special
3316  * cpuset file 'memory_pressure_enabled' in the root cpuset.
3317  */
3318
3319 int cpuset_memory_pressure_enabled __read_mostly;
3320
3321 /**
3322  * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
3323  *
3324  * Keep a running average of the rate of synchronous (direct)
3325  * page reclaim efforts initiated by tasks in each cpuset.
3326  *
3327  * This represents the rate at which some task in the cpuset
3328  * ran low on memory on all nodes it was allowed to use, and
3329  * had to enter the kernels page reclaim code in an effort to
3330  * create more free memory by tossing clean pages or swapping
3331  * or writing dirty pages.
3332  *
3333  * Display to user space in the per-cpuset read-only file
3334  * "memory_pressure".  Value displayed is an integer
3335  * representing the recent rate of entry into the synchronous
3336  * (direct) page reclaim by any task attached to the cpuset.
3337  **/
3338
3339 void __cpuset_memory_pressure_bump(void)
3340 {
3341         rcu_read_lock();
3342         fmeter_markevent(&task_cs(current)->fmeter);
3343         rcu_read_unlock();
3344 }
3345
3346 #ifdef CONFIG_PROC_PID_CPUSET
3347 /*
3348  * proc_cpuset_show()
3349  *  - Print tasks cpuset path into seq_file.
3350  *  - Used for /proc/<pid>/cpuset.
3351  *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
3352  *    doesn't really matter if tsk->cpuset changes after we read it,
3353  *    and we take cpuset_mutex, keeping cpuset_attach() from changing it
3354  *    anyway.
3355  */
3356 int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
3357                      struct pid *pid, struct task_struct *tsk)
3358 {
3359         char *buf;
3360         struct cgroup_subsys_state *css;
3361         int retval;
3362
3363         retval = -ENOMEM;
3364         buf = kmalloc(PATH_MAX, GFP_KERNEL);
3365         if (!buf)
3366                 goto out;
3367
3368         css = task_get_css(tsk, cpuset_cgrp_id);
3369         retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
3370                                 current->nsproxy->cgroup_ns);
3371         css_put(css);
3372         if (retval >= PATH_MAX)
3373                 retval = -ENAMETOOLONG;
3374         if (retval < 0)
3375                 goto out_free;
3376         seq_puts(m, buf);
3377         seq_putc(m, '\n');
3378         retval = 0;
3379 out_free:
3380         kfree(buf);
3381 out:
3382         return retval;
3383 }
3384 #endif /* CONFIG_PROC_PID_CPUSET */
3385
3386 /* Display task mems_allowed in /proc/<pid>/status file. */
3387 void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
3388 {
3389         seq_printf(m, "Mems_allowed:\t%*pb\n",
3390                    nodemask_pr_args(&task->mems_allowed));
3391         seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
3392                    nodemask_pr_args(&task->mems_allowed));
3393 }