Merge branch 'for-5.15' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 31 Aug 2021 22:49:04 +0000 (15:49 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 31 Aug 2021 22:49:04 +0000 (15:49 -0700)
Pull cgroup updates from Tejun Heo:
 "Two cpuset behavior changes:

   - cpuset on cgroup2 is changed to enable memory migration based on
     nodemask by default.

   - A notification is generated when cpuset partition state changes.

  All other patches are minor fixes and cleanups"

* 'for-5.15' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cgroup: Avoid compiler warnings with no subsystems
  cgroup/cpuset: Avoid memory migration when nodemasks match
  cgroup/cpuset: Enable memory migration for cpuset v2
  cgroup/cpuset: Enable event notification when partition state changes
  cgroup: cgroup-v1: clean up kernel-doc notation
  cgroup: Replace deprecated CPU-hotplug functions.
  cgroup/cpuset: Fix violation of cpuset locking rule
  cgroup/cpuset: Fix a partition bug with hotplug
  cgroup/cpuset: Miscellaneous code cleanup
  cgroup: remove cgroup_mount from comments

Documentation/admin-guide/cgroup-v2.rst
kernel/cgroup/cgroup-v1.c
kernel/cgroup/cgroup.c
kernel/cgroup/cpuset.c

index 5c7377b..babbe04 100644 (file)
@@ -2056,6 +2056,17 @@ Cpuset Interface Files
        The value of "cpuset.mems" stays constant until the next update
        and won't be affected by any memory nodes hotplug events.
 
+       Setting a non-empty value to "cpuset.mems" causes memory of
+       tasks within the cgroup to be migrated to the designated nodes if
+       they are currently using memory outside of the designated nodes.
+
+       There is a cost for this memory migration.  The migration
+       may not be complete and some memory pages may be left behind.
+       It is therefore recommended that "cpuset.mems" be set properly
+       before spawning new tasks into the cpuset.  Even if there is
+       a need to change "cpuset.mems" with active tasks, it shouldn't
+       be done frequently.
+
   cpuset.mems.effective
        A read-only multiple values file which exists on all
        cpuset-enabled cgroups.
index de2c432..35b9203 100644 (file)
@@ -50,6 +50,8 @@ bool cgroup1_ssid_disabled(int ssid)
  * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
  * @from: attach to all cgroups of a given task
  * @tsk: the task to be attached
+ *
+ * Return: %0 on success or a negative errno code on failure
  */
 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 {
@@ -80,7 +82,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
 
 /**
- * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
+ * cgroup_transfer_tasks - move tasks from one cgroup to another
  * @to: cgroup to which the tasks will be moved
  * @from: cgroup in which the tasks currently reside
  *
@@ -89,6 +91,8 @@ EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
  * is guaranteed to be either visible in the source cgroup after the
  * parent's migration is complete or put into the target cgroup.  No task
  * can slip out of migration through forking.
+ *
+ * Return: %0 on success or a negative errno code on failure
  */
 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
 {
@@ -682,6 +686,8 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
  *
  * Build and fill cgroupstats so that taskstats can export it to user
  * space.
+ *
+ * Return: %0 on success or a negative errno code on failure
  */
 int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
 {
index 3a0161c..881ce14 100644 (file)
 /* let's not notify more than 100 times per second */
 #define CGROUP_FILE_NOTIFY_MIN_INTV    DIV_ROUND_UP(HZ, 100)
 
+/*
+ * To avoid confusing the compiler (and generating warnings) with code
+ * that attempts to access what would be a 0-element array (i.e. sized
+ * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this
+ * constant expression can be added.
+ */
+#define CGROUP_HAS_SUBSYS_CONFIG       (CGROUP_SUBSYS_COUNT > 0)
+
 /*
  * cgroup_mutex is the master lock.  Any modification to cgroup or its
  * hierarchy must be performed while holding it.
@@ -248,7 +256,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
  */
 bool cgroup_ssid_enabled(int ssid)
 {
-       if (CGROUP_SUBSYS_COUNT == 0)
+       if (!CGROUP_HAS_SUBSYS_CONFIG)
                return false;
 
        return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
@@ -472,7 +480,7 @@ static u16 cgroup_ss_mask(struct cgroup *cgrp)
 static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
                                              struct cgroup_subsys *ss)
 {
-       if (ss)
+       if (CGROUP_HAS_SUBSYS_CONFIG && ss)
                return rcu_dereference_check(cgrp->subsys[ss->id],
                                        lockdep_is_held(&cgroup_mutex));
        else
@@ -550,6 +558,9 @@ struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
 {
        struct cgroup_subsys_state *css;
 
+       if (!CGROUP_HAS_SUBSYS_CONFIG)
+               return NULL;
+
        do {
                css = cgroup_css(cgrp, ss);
 
@@ -577,6 +588,9 @@ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
 {
        struct cgroup_subsys_state *css;
 
+       if (!CGROUP_HAS_SUBSYS_CONFIG)
+               return NULL;
+
        rcu_read_lock();
 
        do {
@@ -647,7 +661,7 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
         * the matching css from the cgroup's subsys table is guaranteed to
         * be and stay valid until the enclosing operation is complete.
         */
-       if (cft->ss)
+       if (CGROUP_HAS_SUBSYS_CONFIG && cft->ss)
                return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
        else
                return &cgrp->self;
@@ -695,7 +709,7 @@ EXPORT_SYMBOL_GPL(of_css);
  */
 #define do_each_subsys_mask(ss, ssid, ss_mask) do {                    \
        unsigned long __ss_mask = (ss_mask);                            \
-       if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */ \
+       if (!CGROUP_HAS_SUBSYS_CONFIG) {                                \
                (ssid) = 0;                                             \
                break;                                                  \
        }                                                               \
@@ -2169,7 +2183,6 @@ static void cgroup_kill_sb(struct super_block *sb)
        /*
         * If @root doesn't have any children, start killing it.
         * This prevents new mounts by disabling percpu_ref_tryget_live().
-        * cgroup_mount() may wait for @root's release.
         *
         * And don't kill the default root.
         */
@@ -2373,7 +2386,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
        struct css_set *cset = tset->cur_cset;
        struct task_struct *task = tset->cur_task;
 
-       while (&cset->mg_node != tset->csets) {
+       while (CGROUP_HAS_SUBSYS_CONFIG && &cset->mg_node != tset->csets) {
                if (!task)
                        task = list_first_entry(&cset->mg_tasks,
                                                struct task_struct, cg_list);
@@ -4644,7 +4657,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
        it->ss = css->ss;
        it->flags = flags;
 
-       if (it->ss)
+       if (CGROUP_HAS_SUBSYS_CONFIG && it->ss)
                it->cset_pos = &css->cgroup->e_csets[css->ss->id];
        else
                it->cset_pos = &css->cgroup->cset_links;
index 6500cbe..df1ccf4 100644 (file)
@@ -160,6 +160,9 @@ struct cpuset {
         */
        int use_parent_ecpus;
        int child_ecpus_count;
+
+       /* Handle for cpuset.cpus.partition */
+       struct cgroup_file partition_file;
 };
 
 /*
@@ -263,6 +266,16 @@ static inline int is_partition_root(const struct cpuset *cs)
        return cs->partition_root_state > 0;
 }
 
+/*
+ * Send a notification event whenever partition_root_state changes.
+ */
+static inline void notify_partition_change(struct cpuset *cs,
+                                          int old_prs, int new_prs)
+{
+       if (old_prs != new_prs)
+               cgroup_file_notify(&cs->partition_file);
+}
+
 static struct cpuset top_cpuset = {
        .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
                  (1 << CS_MEM_EXCLUSIVE)),
@@ -992,7 +1005,7 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
  * 'cpus' is removed, then call this routine to rebuild the
  * scheduler's dynamic sched domains.
  *
- * Call with cpuset_mutex held.  Takes get_online_cpus().
+ * Call with cpuset_mutex held.  Takes cpus_read_lock().
  */
 static void rebuild_sched_domains_locked(void)
 {
@@ -1053,11 +1066,11 @@ static void rebuild_sched_domains_locked(void)
 
 void rebuild_sched_domains(void)
 {
-       get_online_cpus();
+       cpus_read_lock();
        percpu_down_write(&cpuset_rwsem);
        rebuild_sched_domains_locked();
        percpu_up_write(&cpuset_rwsem);
-       put_online_cpus();
+       cpus_read_unlock();
 }
 
 /**
@@ -1127,7 +1140,7 @@ enum subparts_cmd {
  * cpus_allowed can be granted or an error code will be returned.
  *
  * For partcmd_disable, the cpuset is being transofrmed from a partition
- * root back to a non-partition root. any CPUs in cpus_allowed that are in
+ * root back to a non-partition root. Any CPUs in cpus_allowed that are in
  * parent's subparts_cpus will be taken away from that cpumask and put back
  * into parent's effective_cpus. 0 should always be returned.
  *
@@ -1161,6 +1174,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
        struct cpuset *parent = parent_cs(cpuset);
        int adding;     /* Moving cpus from effective_cpus to subparts_cpus */
        int deleting;   /* Moving cpus from subparts_cpus to effective_cpus */
+       int old_prs, new_prs;
        bool part_error = false;        /* Partition error? */
 
        percpu_rwsem_assert_held(&cpuset_rwsem);
@@ -1196,6 +1210,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
         * A cpumask update cannot make parent's effective_cpus become empty.
         */
        adding = deleting = false;
+       old_prs = new_prs = cpuset->partition_root_state;
        if (cmd == partcmd_enable) {
                cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
                adding = true;
@@ -1238,7 +1253,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
                /*
                 * partcmd_update w/o newmask:
                 *
-                * addmask = cpus_allowed & parent->effectiveb_cpus
+                * addmask = cpus_allowed & parent->effective_cpus
                 *
                 * Note that parent's subparts_cpus may have been
                 * pre-shrunk in case there is a change in the cpu list.
@@ -1260,11 +1275,11 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
                switch (cpuset->partition_root_state) {
                case PRS_ENABLED:
                        if (part_error)
-                               cpuset->partition_root_state = PRS_ERROR;
+                               new_prs = PRS_ERROR;
                        break;
                case PRS_ERROR:
                        if (!part_error)
-                               cpuset->partition_root_state = PRS_ENABLED;
+                               new_prs = PRS_ENABLED;
                        break;
                }
                /*
@@ -1273,10 +1288,10 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
                part_error = (prev_prs == PRS_ERROR);
        }
 
-       if (!part_error && (cpuset->partition_root_state == PRS_ERROR))
+       if (!part_error && (new_prs == PRS_ERROR))
                return 0;       /* Nothing need to be done */
 
-       if (cpuset->partition_root_state == PRS_ERROR) {
+       if (new_prs == PRS_ERROR) {
                /*
                 * Remove all its cpus from parent's subparts_cpus.
                 */
@@ -1285,7 +1300,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
                                       parent->subparts_cpus);
        }
 
-       if (!adding && !deleting)
+       if (!adding && !deleting && (new_prs == old_prs))
                return 0;
 
        /*
@@ -1312,7 +1327,12 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
        }
 
        parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
+
+       if (old_prs != new_prs)
+               cpuset->partition_root_state = new_prs;
+
        spin_unlock_irq(&callback_lock);
+       notify_partition_change(cpuset, old_prs, new_prs);
 
        return cmd == partcmd_update;
 }
@@ -1334,6 +1354,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
        struct cpuset *cp;
        struct cgroup_subsys_state *pos_css;
        bool need_rebuild_sched_domains = false;
+       int old_prs, new_prs;
 
        rcu_read_lock();
        cpuset_for_each_descendant_pre(cp, pos_css, cs) {
@@ -1373,17 +1394,18 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
                 * update_tasks_cpumask() again for tasks in the parent
                 * cpuset if the parent's subparts_cpus changes.
                 */
-               if ((cp != cs) && cp->partition_root_state) {
+               old_prs = new_prs = cp->partition_root_state;
+               if ((cp != cs) && old_prs) {
                        switch (parent->partition_root_state) {
                        case PRS_DISABLED:
                                /*
                                 * If parent is not a partition root or an
-                                * invalid partition root, clear the state
-                                * state and the CS_CPU_EXCLUSIVE flag.
+                                * invalid partition root, clear its state
+                                * and its CS_CPU_EXCLUSIVE flag.
                                 */
                                WARN_ON_ONCE(cp->partition_root_state
                                             != PRS_ERROR);
-                               cp->partition_root_state = 0;
+                               new_prs = PRS_DISABLED;
 
                                /*
                                 * clear_bit() is an atomic operation and
@@ -1404,11 +1426,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
                                /*
                                 * When parent is invalid, it has to be too.
                                 */
-                               cp->partition_root_state = PRS_ERROR;
-                               if (cp->nr_subparts_cpus) {
-                                       cp->nr_subparts_cpus = 0;
-                                       cpumask_clear(cp->subparts_cpus);
-                               }
+                               new_prs = PRS_ERROR;
                                break;
                        }
                }
@@ -1420,8 +1438,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
                spin_lock_irq(&callback_lock);
 
                cpumask_copy(cp->effective_cpus, tmp->new_cpus);
-               if (cp->nr_subparts_cpus &&
-                  (cp->partition_root_state != PRS_ENABLED)) {
+               if (cp->nr_subparts_cpus && (new_prs != PRS_ENABLED)) {
                        cp->nr_subparts_cpus = 0;
                        cpumask_clear(cp->subparts_cpus);
                } else if (cp->nr_subparts_cpus) {
@@ -1448,7 +1465,12 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
                                        = cpumask_weight(cp->subparts_cpus);
                        }
                }
+
+               if (new_prs != old_prs)
+                       cp->partition_root_state = new_prs;
+
                spin_unlock_irq(&callback_lock);
+               notify_partition_change(cp, old_prs, new_prs);
 
                WARN_ON(!is_in_v2_mode() &&
                        !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
@@ -1625,6 +1647,11 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 {
        struct cpuset_migrate_mm_work *mwork;
 
+       if (nodes_equal(*from, *to)) {
+               mmput(mm);
+               return;
+       }
+
        mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
        if (mwork) {
                mwork->mm = mm;
@@ -1950,34 +1977,32 @@ out:
 
 /*
  * update_prstate - update partititon_root_state
- * cs: the cpuset to update
- * val: 0 - disabled, 1 - enabled
+ * cs: the cpuset to update
+ * new_prs: new partition root state
  *
  * Call with cpuset_mutex held.
  */
-static int update_prstate(struct cpuset *cs, int val)
+static int update_prstate(struct cpuset *cs, int new_prs)
 {
-       int err;
+       int err, old_prs = cs->partition_root_state;
        struct cpuset *parent = parent_cs(cs);
-       struct tmpmasks tmp;
+       struct tmpmasks tmpmask;
 
-       if ((val != 0) && (val != 1))
-               return -EINVAL;
-       if (val == cs->partition_root_state)
+       if (old_prs == new_prs)
                return 0;
 
        /*
         * Cannot force a partial or invalid partition root to a full
         * partition root.
         */
-       if (val && cs->partition_root_state)
+       if (new_prs && (old_prs == PRS_ERROR))
                return -EINVAL;
 
-       if (alloc_cpumasks(NULL, &tmp))
+       if (alloc_cpumasks(NULL, &tmpmask))
                return -ENOMEM;
 
        err = -EINVAL;
-       if (!cs->partition_root_state) {
+       if (!old_prs) {
                /*
                 * Turning on partition root requires setting the
                 * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
@@ -1991,31 +2016,27 @@ static int update_prstate(struct cpuset *cs, int val)
                        goto out;
 
                err = update_parent_subparts_cpumask(cs, partcmd_enable,
-                                                    NULL, &tmp);
+                                                    NULL, &tmpmask);
                if (err) {
                        update_flag(CS_CPU_EXCLUSIVE, cs, 0);
                        goto out;
                }
-               cs->partition_root_state = PRS_ENABLED;
        } else {
                /*
                 * Turning off partition root will clear the
                 * CS_CPU_EXCLUSIVE bit.
                 */
-               if (cs->partition_root_state == PRS_ERROR) {
-                       cs->partition_root_state = 0;
+               if (old_prs == PRS_ERROR) {
                        update_flag(CS_CPU_EXCLUSIVE, cs, 0);
                        err = 0;
                        goto out;
                }
 
                err = update_parent_subparts_cpumask(cs, partcmd_disable,
-                                                    NULL, &tmp);
+                                                    NULL, &tmpmask);
                if (err)
                        goto out;
 
-               cs->partition_root_state = 0;
-
                /* Turning off CS_CPU_EXCLUSIVE will not return error */
                update_flag(CS_CPU_EXCLUSIVE, cs, 0);
        }
@@ -2028,11 +2049,18 @@ static int update_prstate(struct cpuset *cs, int val)
                update_tasks_cpumask(parent);
 
        if (parent->child_ecpus_count)
-               update_sibling_cpumasks(parent, cs, &tmp);
+               update_sibling_cpumasks(parent, cs, &tmpmask);
 
        rebuild_sched_domains_locked();
 out:
-       free_cpumasks(NULL, &tmp);
+       if (!err) {
+               spin_lock_irq(&callback_lock);
+               cs->partition_root_state = new_prs;
+               spin_unlock_irq(&callback_lock);
+               notify_partition_change(cs, old_prs, new_prs);
+       }
+
+       free_cpumasks(NULL, &tmpmask);
        return err;
 }
 
@@ -2293,7 +2321,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
        cpuset_filetype_t type = cft->private;
        int retval = 0;
 
-       get_online_cpus();
+       cpus_read_lock();
        percpu_down_write(&cpuset_rwsem);
        if (!is_cpuset_online(cs)) {
                retval = -ENODEV;
@@ -2331,7 +2359,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
        }
 out_unlock:
        percpu_up_write(&cpuset_rwsem);
-       put_online_cpus();
+       cpus_read_unlock();
        return retval;
 }
 
@@ -2342,7 +2370,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
        cpuset_filetype_t type = cft->private;
        int retval = -ENODEV;
 
-       get_online_cpus();
+       cpus_read_lock();
        percpu_down_write(&cpuset_rwsem);
        if (!is_cpuset_online(cs))
                goto out_unlock;
@@ -2357,7 +2385,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
        }
 out_unlock:
        percpu_up_write(&cpuset_rwsem);
-       put_online_cpus();
+       cpus_read_unlock();
        return retval;
 }
 
@@ -2396,7 +2424,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
        kernfs_break_active_protection(of->kn);
        flush_work(&cpuset_hotplug_work);
 
-       get_online_cpus();
+       cpus_read_lock();
        percpu_down_write(&cpuset_rwsem);
        if (!is_cpuset_online(cs))
                goto out_unlock;
@@ -2422,7 +2450,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
        free_cpuset(trialcs);
 out_unlock:
        percpu_up_write(&cpuset_rwsem);
-       put_online_cpus();
+       cpus_read_unlock();
        kernfs_unbreak_active_protection(of->kn);
        css_put(&cs->css);
        flush_workqueue(cpuset_migrate_mm_wq);
@@ -2553,7 +2581,7 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
                return -EINVAL;
 
        css_get(&cs->css);
-       get_online_cpus();
+       cpus_read_lock();
        percpu_down_write(&cpuset_rwsem);
        if (!is_cpuset_online(cs))
                goto out_unlock;
@@ -2561,7 +2589,7 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
        retval = update_prstate(cs, val);
 out_unlock:
        percpu_up_write(&cpuset_rwsem);
-       put_online_cpus();
+       cpus_read_unlock();
        css_put(&cs->css);
        return retval ?: nbytes;
 }
@@ -2713,6 +2741,7 @@ static struct cftype dfl_files[] = {
                .write = sched_partition_write,
                .private = FILE_PARTITION_ROOT,
                .flags = CFTYPE_NOT_ON_ROOT,
+               .file_offset = offsetof(struct cpuset, partition_file),
        },
 
        {
@@ -2748,12 +2777,16 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
                return ERR_PTR(-ENOMEM);
        }
 
-       set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+       __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
        nodes_clear(cs->mems_allowed);
        nodes_clear(cs->effective_mems);
        fmeter_init(&cs->fmeter);
        cs->relax_domain_level = -1;
 
+       /* Set CS_MEMORY_MIGRATE for default hierarchy */
+       if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+               __set_bit(CS_MEMORY_MIGRATE, &cs->flags);
+
        return &cs->css;
 }
 
@@ -2767,7 +2800,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
        if (!parent)
                return 0;
 
-       get_online_cpus();
+       cpus_read_lock();
        percpu_down_write(&cpuset_rwsem);
 
        set_bit(CS_ONLINE, &cs->flags);
@@ -2820,7 +2853,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
        spin_unlock_irq(&callback_lock);
 out_unlock:
        percpu_up_write(&cpuset_rwsem);
-       put_online_cpus();
+       cpus_read_unlock();
        return 0;
 }
 
@@ -2839,7 +2872,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
 {
        struct cpuset *cs = css_cs(css);
 
-       get_online_cpus();
+       cpus_read_lock();
        percpu_down_write(&cpuset_rwsem);
 
        if (is_partition_root(cs))
@@ -2860,7 +2893,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
        clear_bit(CS_ONLINE, &cs->flags);
 
        percpu_up_write(&cpuset_rwsem);
-       put_online_cpus();
+       cpus_read_unlock();
 }
 
 static void cpuset_css_free(struct cgroup_subsys_state *css)
@@ -3071,7 +3104,7 @@ retry:
                goto retry;
        }
 
-       parent =  parent_cs(cs);
+       parent = parent_cs(cs);
        compute_effective_cpumask(&new_cpus, cs, parent);
        nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
 
@@ -3093,8 +3126,10 @@ retry:
        if (is_partition_root(cs) && (cpumask_empty(&new_cpus) ||
           (parent->partition_root_state == PRS_ERROR))) {
                if (cs->nr_subparts_cpus) {
+                       spin_lock_irq(&callback_lock);
                        cs->nr_subparts_cpus = 0;
                        cpumask_clear(cs->subparts_cpus);
+                       spin_unlock_irq(&callback_lock);
                        compute_effective_cpumask(&new_cpus, cs, parent);
                }
 
@@ -3106,9 +3141,17 @@ retry:
                 */
                if ((parent->partition_root_state == PRS_ERROR) ||
                     cpumask_empty(&new_cpus)) {
+                       int old_prs;
+
                        update_parent_subparts_cpumask(cs, partcmd_disable,
                                                       NULL, tmp);
-                       cs->partition_root_state = PRS_ERROR;
+                       old_prs = cs->partition_root_state;
+                       if (old_prs != PRS_ERROR) {
+                               spin_lock_irq(&callback_lock);
+                               cs->partition_root_state = PRS_ERROR;
+                               spin_unlock_irq(&callback_lock);
+                               notify_partition_change(cs, old_prs, PRS_ERROR);
+                       }
                }
                cpuset_force_rebuild();
        }
@@ -3179,6 +3222,13 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
        cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
        mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
 
+       /*
+        * In the rare case that hotplug removes all the cpus in subparts_cpus,
+        * we assume that the cpus have been updated.
+        */
+       if (!cpus_updated && top_cpuset.nr_subparts_cpus)
+               cpus_updated = true;
+
        /* synchronize cpus_allowed to cpu_active_mask */
        if (cpus_updated) {
                spin_lock_irq(&callback_lock);