Merge branch 'for-4.21' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
[linux-2.6-microblaze.git] / kernel / cgroup / cpuset.c
index 21eaa89..479743d 100644 (file)
@@ -113,6 +113,9 @@ struct cpuset {
         * CPUs allocated to child sub-partitions (default hierarchy only)
         * - CPUs granted by the parent = effective_cpus U subparts_cpus
         * - effective_cpus and subparts_cpus are mutually exclusive.
+        *
+        * effective_cpus contains only online CPUs, but subparts_cpus
+        * may also contain offline ones.
         */
        cpumask_var_t subparts_cpus;
 
@@ -766,13 +769,14 @@ static int generate_sched_domains(cpumask_var_t **domains,
        int ndoms = 0;          /* number of sched domains in result */
        int nslot;              /* next empty doms[] struct cpumask slot */
        struct cgroup_subsys_state *pos_css;
+       bool root_load_balance = is_sched_load_balance(&top_cpuset);
 
        doms = NULL;
        dattr = NULL;
        csa = NULL;
 
        /* Special case for the 99% of systems with one, full, sched domain */
-       if (is_sched_load_balance(&top_cpuset)) {
+       if (root_load_balance && !top_cpuset.nr_subparts_cpus) {
                ndoms = 1;
                doms = alloc_sched_domains(ndoms);
                if (!doms)
@@ -795,6 +799,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
        csn = 0;
 
        rcu_read_lock();
+       if (root_load_balance)
+               csa[csn++] = &top_cpuset;
        cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
                if (cp == &top_cpuset)
                        continue;
@@ -805,6 +811,9 @@ static int generate_sched_domains(cpumask_var_t **domains,
                 * parent's cpus, so just skip them, and then we call
                 * update_domain_attr_tree() to calc relax_domain_level of
                 * the corresponding sched domain.
+                *
+                * If root is load-balancing, we can skip @cp if it
+                * is a subset of the root's effective_cpus.
                 */
                if (!cpumask_empty(cp->cpus_allowed) &&
                    !(is_sched_load_balance(cp) &&
@@ -812,11 +821,16 @@ static int generate_sched_domains(cpumask_var_t **domains,
                                         housekeeping_cpumask(HK_FLAG_DOMAIN))))
                        continue;
 
+               if (root_load_balance &&
+                   cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
+                       continue;
+
                if (is_sched_load_balance(cp))
                        csa[csn++] = cp;
 
-               /* skip @cp's subtree */
-               pos_css = css_rightmost_descendant(pos_css);
+               /* skip @cp's subtree if not a partition root */
+               if (!is_partition_root(cp))
+                       pos_css = css_rightmost_descendant(pos_css);
        }
        rcu_read_unlock();
 
@@ -944,7 +958,12 @@ static void rebuild_sched_domains_locked(void)
         * passing doms with offlined cpu to partition_sched_domains().
         * Anyways, hotplug work item will rebuild sched domains.
         */
-       if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
+       if (!top_cpuset.nr_subparts_cpus &&
+           !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
+               goto out;
+
+       if (top_cpuset.nr_subparts_cpus &&
+          !cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask))
                goto out;
 
        /* Generate domain masks and attrs */
@@ -994,7 +1013,9 @@ static void update_tasks_cpumask(struct cpuset *cs)
  * @parent: the parent cpuset
  *
  * If the parent has subpartition CPUs, include them in the list of
- * allowable CPUs in computing the new effective_cpus mask.
+ * allowable CPUs in computing the new effective_cpus mask. Since offlined
+ * CPUs are not removed from subparts_cpus, we have to use cpu_active_mask
+ * to mask those out.
  */
 static void compute_effective_cpumask(struct cpumask *new_cpus,
                                      struct cpuset *cs, struct cpuset *parent)
@@ -1003,6 +1024,7 @@ static void compute_effective_cpumask(struct cpumask *new_cpus,
                cpumask_or(new_cpus, parent->effective_cpus,
                           parent->subparts_cpus);
                cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
+               cpumask_and(new_cpus, new_cpus, cpu_active_mask);
        } else {
                cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
        }
@@ -1125,9 +1147,20 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
                /*
                 * Return error if the new effective_cpus could become empty.
                 */
-               if (adding && !deleting &&
-                   cpumask_equal(parent->effective_cpus, tmp->addmask))
-                       return -EINVAL;
+               if (adding &&
+                   cpumask_equal(parent->effective_cpus, tmp->addmask)) {
+                       if (!deleting)
+                               return -EINVAL;
+                       /*
+                        * As some of the CPUs in subparts_cpus might have
+                        * been offlined, we need to compute the real delmask
+                        * to see if effective_cpus would really become empty.
+                        */
+                       if (!cpumask_and(tmp->addmask, tmp->delmask,
+                                        cpu_active_mask))
+                               return -EINVAL;
+                       cpumask_copy(tmp->addmask, parent->effective_cpus);
+               }
        } else {
                /*
                 * partcmd_update w/o newmask:
@@ -1197,6 +1230,10 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
        if (deleting) {
                cpumask_andnot(parent->subparts_cpus,
                               parent->subparts_cpus, tmp->delmask);
+               /*
+                * Some of the CPUs in subparts_cpus might have been offlined.
+                */
+               cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask);
                cpumask_or(parent->effective_cpus,
                           parent->effective_cpus, tmp->delmask);
        }
@@ -1346,11 +1383,15 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
                update_tasks_cpumask(cp);
 
                /*
-                * If the effective cpumask of any non-empty cpuset is changed,
-                * we need to rebuild sched domains.
+                * On legacy hierarchy, if the effective cpumask of any non-
+                * empty cpuset is changed, we need to rebuild sched domains.
+                * On default hierarchy, the cpuset needs to be a partition
+                * root as well.
                 */
                if (!cpumask_empty(cp->cpus_allowed) &&
-                   is_sched_load_balance(cp))
+                   is_sched_load_balance(cp) &&
+                  (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+                   is_partition_root(cp)))
                        need_rebuild_sched_domains = true;
 
                rcu_read_lock();
@@ -2068,10 +2109,8 @@ out_unlock:
 static void cpuset_cancel_attach(struct cgroup_taskset *tset)
 {
        struct cgroup_subsys_state *css;
-       struct cpuset *cs;
 
        cgroup_taskset_first(tset, &css);
-       cs = css_cs(css);
 
        mutex_lock(&cpuset_mutex);
        css_cs(css)->attach_in_progress--;
@@ -2163,6 +2202,7 @@ typedef enum {
        FILE_MEMLIST,
        FILE_EFFECTIVE_CPULIST,
        FILE_EFFECTIVE_MEMLIST,
+       FILE_SUBPARTS_CPULIST,
        FILE_CPU_EXCLUSIVE,
        FILE_MEM_EXCLUSIVE,
        FILE_MEM_HARDWALL,
@@ -2237,9 +2277,6 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
        case FILE_SCHED_RELAX_DOMAIN_LEVEL:
                retval = update_relax_domain_level(cs, val);
                break;
-       case FILE_PARTITION_ROOT:
-               retval = update_prstate(cs, val);
-               break;
        default:
                retval = -EINVAL;
                break;
@@ -2344,6 +2381,9 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
        case FILE_EFFECTIVE_MEMLIST:
                seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
                break;
+       case FILE_SUBPARTS_CPULIST:
+               seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus));
+               break;
        default:
                ret = -EINVAL;
        }
@@ -2390,8 +2430,6 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
        switch (type) {
        case FILE_SCHED_RELAX_DOMAIN_LEVEL:
                return cs->relax_domain_level;
-       case FILE_PARTITION_ROOT:
-               return cs->partition_root_state;
        default:
                BUG();
        }
@@ -2400,6 +2438,55 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
        return 0;
 }
 
+static int sched_partition_show(struct seq_file *seq, void *v)
+{
+       struct cpuset *cs = css_cs(seq_css(seq));
+
+       switch (cs->partition_root_state) {
+       case PRS_ENABLED:
+               seq_puts(seq, "root\n");
+               break;
+       case PRS_DISABLED:
+               seq_puts(seq, "member\n");
+               break;
+       case PRS_ERROR:
+               seq_puts(seq, "root invalid\n");
+               break;
+       }
+       return 0;
+}
+
+static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
+                                    size_t nbytes, loff_t off)
+{
+       struct cpuset *cs = css_cs(of_css(of));
+       int val;
+       int retval = -ENODEV;
+
+       buf = strstrip(buf);
+
+       /*
+        * Map "root" to PRS_ENABLED and "member" to PRS_DISABLED.
+        */
+       if (!strcmp(buf, "root"))
+               val = PRS_ENABLED;
+       else if (!strcmp(buf, "member"))
+               val = PRS_DISABLED;
+       else
+               return -EINVAL;
+
+       css_get(&cs->css);
+       mutex_lock(&cpuset_mutex);
+       if (!is_cpuset_online(cs))
+               goto out_unlock;
+
+       retval = update_prstate(cs, val);
+out_unlock:
+       mutex_unlock(&cpuset_mutex);
+       css_put(&cs->css);
+       return retval ?: nbytes;
+}
+
 /*
  * for the common functions, 'private' gives the type of file
  */
@@ -2533,24 +2620,29 @@ static struct cftype dfl_files[] = {
                .name = "cpus.effective",
                .seq_show = cpuset_common_seq_show,
                .private = FILE_EFFECTIVE_CPULIST,
-               .flags = CFTYPE_NOT_ON_ROOT,
        },
 
        {
                .name = "mems.effective",
                .seq_show = cpuset_common_seq_show,
                .private = FILE_EFFECTIVE_MEMLIST,
-               .flags = CFTYPE_NOT_ON_ROOT,
        },
 
        {
-               .name = "sched.partition",
-               .read_s64 = cpuset_read_s64,
-               .write_s64 = cpuset_write_s64,
+               .name = "cpus.partition",
+               .seq_show = sched_partition_show,
+               .write = sched_partition_write,
                .private = FILE_PARTITION_ROOT,
                .flags = CFTYPE_NOT_ON_ROOT,
        },
 
+       {
+               .name = "cpus.subpartitions",
+               .seq_show = cpuset_common_seq_show,
+               .private = FILE_SUBPARTS_CPULIST,
+               .flags = CFTYPE_DEBUG,
+       },
+
        { }     /* terminate */
 };
 
@@ -2863,20 +2955,29 @@ hotplug_update_tasks(struct cpuset *cs,
                update_tasks_nodemask(cs);
 }
 
+static bool force_rebuild;
+
+void cpuset_force_rebuild(void)
+{
+       force_rebuild = true;
+}
+
 /**
  * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
  * @cs: cpuset in interest
+ * @tmp: the tmpmasks structure pointer; if NULL, partition handling is skipped
  *
  * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
  * offline, update @cs accordingly.  If @cs ends up with no CPU or memory,
  * all its tasks are moved to the nearest ancestor with both resources.
  */
-static void cpuset_hotplug_update_tasks(struct cpuset *cs)
+static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
 {
        static cpumask_t new_cpus;
        static nodemask_t new_mems;
        bool cpus_updated;
        bool mems_updated;
+       struct cpuset *parent;
 retry:
        wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
 
@@ -2891,9 +2992,60 @@ retry:
                goto retry;
        }
 
-       cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
-       nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
+       parent =  parent_cs(cs);
+       compute_effective_cpumask(&new_cpus, cs, parent);
+       nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
+
+       if (cs->nr_subparts_cpus)
+               /*
+                * Make sure that CPUs allocated to child partitions
+                * do not show up in effective_cpus.
+                */
+               cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus);
+
+       if (!tmp || !cs->partition_root_state)
+               goto update_tasks;
+
+       /*
+        * In the unlikely event that a partition root has empty
+        * effective_cpus or its parent becomes erroneous, we have to
+        * transition it to the erroneous state.
+        */
+       if (is_partition_root(cs) && (cpumask_empty(&new_cpus) ||
+          (parent->partition_root_state == PRS_ERROR))) {
+               if (cs->nr_subparts_cpus) {
+                       cs->nr_subparts_cpus = 0;
+                       cpumask_clear(cs->subparts_cpus);
+                       compute_effective_cpumask(&new_cpus, cs, parent);
+               }
+
+               /*
+                * If the effective_cpus is empty because the child
+                * partitions take away all the CPUs, we can keep
+                * the current partition and let the child partitions
+                * fight for available CPUs.
+                */
+               if ((parent->partition_root_state == PRS_ERROR) ||
+                    cpumask_empty(&new_cpus)) {
+                       update_parent_subparts_cpumask(cs, partcmd_disable,
+                                                      NULL, tmp);
+                       cs->partition_root_state = PRS_ERROR;
+               }
+               cpuset_force_rebuild();
+       }
+
+       /*
+        * On the other hand, an erroneous partition root may be transitioned
+        * back to a regular one, or a partition root with no CPU allocated
+        * from the parent may change to erroneous.
+        */
+       if (is_partition_root(parent) &&
+          ((cs->partition_root_state == PRS_ERROR) ||
+           !cpumask_intersects(&new_cpus, parent->subparts_cpus)) &&
+            update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
+               cpuset_force_rebuild();
 
+update_tasks:
        cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
        mems_updated = !nodes_equal(new_mems, cs->effective_mems);
 
@@ -2907,13 +3059,6 @@ retry:
        mutex_unlock(&cpuset_mutex);
 }
 
-static bool force_rebuild;
-
-void cpuset_force_rebuild(void)
-{
-       force_rebuild = true;
-}
-
 /**
  * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
  *
@@ -2936,6 +3081,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
        static nodemask_t new_mems;
        bool cpus_updated, mems_updated;
        bool on_dfl = is_in_v2_mode();
+       struct tmpmasks tmp, *ptmp = NULL;
+
+       if (on_dfl && !alloc_cpumasks(NULL, &tmp))
+               ptmp = &tmp;
 
        mutex_lock(&cpuset_mutex);
 
@@ -2943,6 +3092,11 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
        cpumask_copy(&new_cpus, cpu_active_mask);
        new_mems = node_states[N_MEMORY];
 
+       /*
+        * If subparts_cpus is populated, it is likely that the check below
+        * will produce a false positive on cpus_updated when the cpu list
+        * isn't changed. It is extra work, but it is better to be safe.
+        */
        cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
        mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
 
@@ -2951,6 +3105,22 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
                spin_lock_irq(&callback_lock);
                if (!on_dfl)
                        cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+               /*
+                * Make sure that CPUs allocated to child partitions
+                * do not show up in effective_cpus. If no CPU is left,
+                * we clear the subparts_cpus & let the child partitions
+                * fight for the CPUs again.
+                */
+               if (top_cpuset.nr_subparts_cpus) {
+                       if (cpumask_subset(&new_cpus,
+                                          top_cpuset.subparts_cpus)) {
+                               top_cpuset.nr_subparts_cpus = 0;
+                               cpumask_clear(top_cpuset.subparts_cpus);
+                       } else {
+                               cpumask_andnot(&new_cpus, &new_cpus,
+                                              top_cpuset.subparts_cpus);
+                       }
+               }
                cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
                spin_unlock_irq(&callback_lock);
                /* we don't mess with cpumasks of tasks in top_cpuset */
@@ -2979,7 +3149,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
                                continue;
                        rcu_read_unlock();
 
-                       cpuset_hotplug_update_tasks(cs);
+                       cpuset_hotplug_update_tasks(cs, ptmp);
 
                        rcu_read_lock();
                        css_put(&cs->css);
@@ -2992,6 +3162,8 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
                force_rebuild = false;
                rebuild_sched_domains();
        }
+
+       free_cpumasks(NULL, ptmp);
 }
 
 void cpuset_update_active_cpus(void)
@@ -3302,9 +3474,9 @@ void cpuset_print_current_mems_allowed(void)
        rcu_read_lock();
 
        cgrp = task_cs(current)->css.cgroup;
-       pr_info("%s cpuset=", current->comm);
+       pr_cont(",cpuset=");
        pr_cont_cgroup_name(cgrp);
-       pr_cont(" mems_allowed=%*pbl\n",
+       pr_cont(",mems_allowed=%*pbl",
                nodemask_pr_args(&current->mems_allowed));
 
        rcu_read_unlock();