Merge branch 'linus' into sched/core, to resolve semantic conflict
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e7e4534..a7abbba 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -97,7 +97,7 @@ int sysctl_sched_rt_runtime = 950000;
  *
  * Normal scheduling state is serialized by rq->lock. __schedule() takes the
  * local CPU's rq->lock, it optionally removes the task from the runqueue and
- * always looks at the local rq data structures to find the most elegible task
+ * always looks at the local rq data structures to find the most eligible task
  * to run next.
  *
  * Task enqueue is also under rq->lock, possibly taken from another CPU.
@@ -320,14 +320,6 @@ void update_rq_clock(struct rq *rq)
        update_rq_clock_task(rq, delta);
 }
 
-static inline void
-rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func)
-{
-       csd->flags = 0;
-       csd->func = func;
-       csd->info = rq;
-}
-
 #ifdef CONFIG_SCHED_HRTICK
 /*
  * Use HR-timers to deliver accurate preemption points.
@@ -428,7 +420,7 @@ void hrtick_start(struct rq *rq, u64 delay)
 static void hrtick_rq_init(struct rq *rq)
 {
 #ifdef CONFIG_SMP
-       rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start);
+       INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
 #endif
        hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
        rq->hrtick_timer.function = hrtick;
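/*
 * The hunk above drops the open-coded rq_csd_init() helper in favour of the
 * generic INIT_CSD() macro from <linux/smp.h>, which takes the csd, the
 * function and the 'info' cookie in that order.  A minimal sketch of the
 * general usage pattern -- my_csd_func(), struct my_state and my_kick() are
 * illustrative names only, not part of this patch:
 */
#include <linux/smp.h>

struct my_state {
	int value;
};

static void my_csd_func(void *info)
{
	struct my_state *state = info;	/* the 'info' cookie from INIT_CSD() */

	/* runs on the remote CPU, in IPI context -- must not sleep */
	state->value++;
}

static call_single_data_t my_csd;
static struct my_state my_csd_state;

static void my_kick(int cpu)
{
	INIT_CSD(&my_csd, my_csd_func, &my_csd_state);

	/* the csd must not be reused until the previous call has completed */
	smp_call_function_single_async(cpu, &my_csd);
}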
@@ -518,7 +510,7 @@ static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
 
        /*
         * Atomically grab the task, if ->wake_q is !nil already it means
-        * its already queued (either by us or someone else) and will get the
+        * it's already queued (either by us or someone else) and will get the
         * wakeup due to that.
         *
         * In order to ensure that a pending wakeup will observe our pending
@@ -769,7 +761,7 @@ bool sched_can_stop_tick(struct rq *rq)
                return false;
 
        /*
-        * If there are more than one RR tasks, we need the tick to effect the
+        * If there are more than one RR tasks, we need the tick to affect the
         * actual RR behaviour.
         */
        if (rq->rt.rr_nr_running) {
@@ -1187,14 +1179,14 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
         * accounting was performed at enqueue time and we can just return
         * here.
         *
-        * Need to be careful of the following enqeueue/dequeue ordering
+        * Need to be careful of the following enqueue/dequeue ordering
         * problem too
         *
         *      enqueue(taskA)
         *      // sched_uclamp_used gets enabled
         *      enqueue(taskB)
         *      dequeue(taskA)
-        *      // Must not decrement bukcet->tasks here
+        *      // Must not decrement bucket->tasks here
         *      dequeue(taskB)
         *
         * where we could end up with stale data in uc_se and
@@ -1413,17 +1405,24 @@ done:
 static int uclamp_validate(struct task_struct *p,
                           const struct sched_attr *attr)
 {
-       unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value;
-       unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value;
+       int util_min = p->uclamp_req[UCLAMP_MIN].value;
+       int util_max = p->uclamp_req[UCLAMP_MAX].value;
 
-       if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN)
-               lower_bound = attr->sched_util_min;
-       if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX)
-               upper_bound = attr->sched_util_max;
+       if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
+               util_min = attr->sched_util_min;
 
-       if (lower_bound > upper_bound)
-               return -EINVAL;
-       if (upper_bound > SCHED_CAPACITY_SCALE)
+               if (util_min + 1 > SCHED_CAPACITY_SCALE + 1)
+                       return -EINVAL;
+       }
+
+       if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
+               util_max = attr->sched_util_max;
+
+               if (util_max + 1 > SCHED_CAPACITY_SCALE + 1)
+                       return -EINVAL;
+       }
+
+       if (util_min != -1 && util_max != -1 && util_min > util_max)
                return -EINVAL;
 
        /*
@@ -1438,20 +1437,41 @@ static int uclamp_validate(struct task_struct *p,
        return 0;
 }
 
+static bool uclamp_reset(const struct sched_attr *attr,
+                        enum uclamp_id clamp_id,
+                        struct uclamp_se *uc_se)
+{
+       /* Reset on sched class change for a non user-defined clamp value. */
+       if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
+           !uc_se->user_defined)
+               return true;
+
+       /* Reset on sched_util_{min,max} == -1. */
+       if (clamp_id == UCLAMP_MIN &&
+           attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
+           attr->sched_util_min == -1) {
+               return true;
+       }
+
+       if (clamp_id == UCLAMP_MAX &&
+           attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
+           attr->sched_util_max == -1) {
+               return true;
+       }
+
+       return false;
+}
+
 static void __setscheduler_uclamp(struct task_struct *p,
                                  const struct sched_attr *attr)
 {
        enum uclamp_id clamp_id;
 
-       /*
-        * On scheduling class change, reset to default clamps for tasks
-        * without a task-specific value.
-        */
        for_each_clamp_id(clamp_id) {
                struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
+               unsigned int value;
 
-               /* Keep using defined clamps across class changes */
-               if (uc_se->user_defined)
+               if (!uclamp_reset(attr, clamp_id, uc_se))
                        continue;
 
                /*
@@ -1459,21 +1479,25 @@ static void __setscheduler_uclamp(struct task_struct *p,
                 * at runtime.
                 */
                if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
-                       __uclamp_update_util_min_rt_default(p);
+                       value = sysctl_sched_uclamp_util_min_rt_default;
                else
-                       uclamp_se_set(uc_se, uclamp_none(clamp_id), false);
+                       value = uclamp_none(clamp_id);
+
+               uclamp_se_set(uc_se, value, false);
 
        }
 
        if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
                return;
 
-       if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
+       if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
+           attr->sched_util_min != -1) {
                uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
                              attr->sched_util_min, true);
        }
 
-       if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
+       if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
+           attr->sched_util_max != -1) {
                uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
                              attr->sched_util_max, true);
        }
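/*
 * Why uclamp_validate() now checks "util_min + 1 > SCHED_CAPACITY_SCALE + 1"
 * on a signed int rather than a plain upper-bound test: -1 is the "reset
 * this clamp to its default" sentinel honoured by uclamp_reset() and
 * __setscheduler_uclamp() above.  Adding 1 to both sides maps -1 to 0, so
 * the sentinel passes validation while anything above SCHED_CAPACITY_SCALE
 * (1024) still fails.  A small sketch of the resulting accept/reject
 * behaviour -- uclamp_value_is_valid() is an illustrative name only:
 */
static bool uclamp_value_is_valid(int value)
{
	/* accepts -1 (reset) through SCHED_CAPACITY_SCALE (1024) inclusive */
	return value + 1 <= SCHED_CAPACITY_SCALE + 1;
}

/*
 *	uclamp_value_is_valid(-1)	-> true, treated as "reset to default"
 *	uclamp_value_is_valid(0)	-> true
 *	uclamp_value_is_valid(1024)	-> true
 *	uclamp_value_is_valid(1025)	-> false, sched_setattr() gets -EINVAL
 */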
@@ -1696,6 +1720,76 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 
 #ifdef CONFIG_SMP
 
+static void
+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
+
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+                                 const struct cpumask *new_mask,
+                                 u32 flags);
+
+static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
+{
+       if (likely(!p->migration_disabled))
+               return;
+
+       if (p->cpus_ptr != &p->cpus_mask)
+               return;
+
+       /*
+        * Violates locking rules! see comment in __do_set_cpus_allowed().
+        */
+       __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);
+}
+
+void migrate_disable(void)
+{
+       struct task_struct *p = current;
+
+       if (p->migration_disabled) {
+               p->migration_disabled++;
+               return;
+       }
+
+       preempt_disable();
+       this_rq()->nr_pinned++;
+       p->migration_disabled = 1;
+       preempt_enable();
+}
+EXPORT_SYMBOL_GPL(migrate_disable);
+
+void migrate_enable(void)
+{
+       struct task_struct *p = current;
+
+       if (p->migration_disabled > 1) {
+               p->migration_disabled--;
+               return;
+       }
+
+       /*
+        * Ensure stop_task runs either before or after this, and that
+        * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
+        */
+       preempt_disable();
+       if (p->cpus_ptr != &p->cpus_mask)
+               __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+       /*
+        * Mustn't clear migration_disabled() until cpus_ptr points back at the
+        * regular cpus_mask, otherwise things that race (e.g.
+        * select_fallback_rq) get confused.
+        */
+       barrier();
+       p->migration_disabled = 0;
+       this_rq()->nr_pinned--;
+       preempt_enable();
+}
+EXPORT_SYMBOL_GPL(migrate_enable);
+
+static inline bool rq_has_pinned_tasks(struct rq *rq)
+{
+       return rq->nr_pinned;
+}
+
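/*
 * A minimal usage sketch for the pair above: migrate_disable() pins the
 * current task to its CPU while leaving preemption enabled, and the calls
 * nest.  Only the outermost migrate_enable() re-allows migration and, if a
 * concurrent set_cpus_allowed_ptr() narrowed the affinity mask in the
 * meantime, kicks off the deferred migration.  my_pinned_section() is an
 * illustrative name only:
 */
static void my_pinned_section(void)
{
	migrate_disable();		/* pin to the current CPU */
	migrate_disable();		/* nests: only bumps the count */

	/* still preemptible here, but guaranteed not to change CPUs */

	migrate_enable();		/* inner: only drops the count */
	migrate_enable();		/* outermost: re-allows migration and
					 * applies any affinity change queued
					 * by a concurrent SCA caller */
}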
 /*
  * Per-CPU kthreads are allowed to run on !active && online CPUs, see
  * __set_cpus_allowed_ptr() and select_fallback_rq().
@@ -1705,7 +1799,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
        if (!cpumask_test_cpu(cpu, p->cpus_ptr))
                return false;
 
-       if (is_per_cpu_kthread(p))
+       if (is_per_cpu_kthread(p) || is_migration_disabled(p))
                return cpu_online(cpu);
 
        return cpu_active(cpu);
@@ -1750,8 +1844,16 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
 }
 
 struct migration_arg {
-       struct task_struct *task;
-       int dest_cpu;
+       struct task_struct              *task;
+       int                             dest_cpu;
+       struct set_affinity_pending     *pending;
+};
+
+struct set_affinity_pending {
+       refcount_t              refs;
+       struct completion       done;
+       struct cpu_stop_work    stop_work;
+       struct migration_arg    arg;
 };
 
 /*
@@ -1783,16 +1885,19 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
  */
 static int migration_cpu_stop(void *data)
 {
+       struct set_affinity_pending *pending;
        struct migration_arg *arg = data;
        struct task_struct *p = arg->task;
+       int dest_cpu = arg->dest_cpu;
        struct rq *rq = this_rq();
+       bool complete = false;
        struct rq_flags rf;
 
        /*
         * The original target CPU might have gone down and we might
         * be on another CPU but it doesn't matter.
         */
-       local_irq_disable();
+       local_irq_save(rf.flags);
        /*
         * We need to explicitly wake pending tasks before running
         * __migrate_task() such that we will not miss enforcing cpus_ptr
@@ -1802,21 +1907,137 @@ static int migration_cpu_stop(void *data)
 
        raw_spin_lock(&p->pi_lock);
        rq_lock(rq, &rf);
+
+       pending = p->migration_pending;
        /*
         * If task_rq(p) != rq, it cannot be migrated here, because we're
         * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
         * we're holding p->pi_lock.
         */
        if (task_rq(p) == rq) {
+               if (is_migration_disabled(p))
+                       goto out;
+
+               if (pending) {
+                       p->migration_pending = NULL;
+                       complete = true;
+               }
+
+               /* migrate_enable() --  we must not race against SCA */
+               if (dest_cpu < 0) {
+                       /*
+                        * When this was migrate_enable() but we no longer
+                        * have a @pending, a concurrent SCA 'fixed' things
+                        * and we should be valid again. Nothing to do.
+                        */
+                       if (!pending) {
+                               WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask));
+                               goto out;
+                       }
+
+                       dest_cpu = cpumask_any_distribute(&p->cpus_mask);
+               }
+
                if (task_on_rq_queued(p))
-                       rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
+                       rq = __migrate_task(rq, &rf, p, dest_cpu);
                else
-                       p->wake_cpu = arg->dest_cpu;
+                       p->wake_cpu = dest_cpu;
+
+       } else if (dest_cpu < 0 || pending) {
+               /*
+                * This happens when we get migrated between migrate_enable()'s
+                * preempt_enable() and scheduling the stopper task. At that
+                * point we're a regular task again and not current anymore.
+                *
+                * A !PREEMPT kernel has a giant hole here, which makes it far
+                * more likely.
+                */
+
+               /*
+                * The task moved before the stopper got to run. We're holding
+                * ->pi_lock, so the allowed mask is stable - if it got
+                * somewhere allowed, we're done.
+                */
+               if (pending && cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
+                       p->migration_pending = NULL;
+                       complete = true;
+                       goto out;
+               }
+
+               /*
+                * When this was migrate_enable() but we no longer have an
+                * @pending, a concurrent SCA 'fixed' things and we should be
+                * valid again. Nothing to do.
+                */
+               if (!pending) {
+                       WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask));
+                       goto out;
+               }
+
+               /*
+                * When migrate_enable() hits a rq mis-match we can't reliably
+                * determine is_migration_disabled() and so have to chase after
+                * it.
+                */
+               task_rq_unlock(rq, p, &rf);
+               stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
+                                   &pending->arg, &pending->stop_work);
+               return 0;
        }
-       rq_unlock(rq, &rf);
-       raw_spin_unlock(&p->pi_lock);
+out:
+       task_rq_unlock(rq, p, &rf);
+
+       if (complete)
+               complete_all(&pending->done);
+
+       /* For pending->{arg,stop_work} */
+       pending = arg->pending;
+       if (pending && refcount_dec_and_test(&pending->refs))
+               wake_up_var(&pending->refs);
 
-       local_irq_enable();
+       return 0;
+}
+
+int push_cpu_stop(void *arg)
+{
+       struct rq *lowest_rq = NULL, *rq = this_rq();
+       struct task_struct *p = arg;
+
+       raw_spin_lock_irq(&p->pi_lock);
+       raw_spin_lock(&rq->lock);
+
+       if (task_rq(p) != rq)
+               goto out_unlock;
+
+       if (is_migration_disabled(p)) {
+               p->migration_flags |= MDF_PUSH;
+               goto out_unlock;
+       }
+
+       p->migration_flags &= ~MDF_PUSH;
+
+       if (p->sched_class->find_lock_rq)
+               lowest_rq = p->sched_class->find_lock_rq(p, rq);
+
+       if (!lowest_rq)
+               goto out_unlock;
+
+       // XXX validate p is still the highest prio task
+       if (task_rq(p) == rq) {
+               deactivate_task(rq, p, 0);
+               set_task_cpu(p, lowest_rq->cpu);
+               activate_task(lowest_rq, p, 0);
+               resched_curr(lowest_rq);
+       }
+
+       double_unlock_balance(rq, lowest_rq);
+
+out_unlock:
+       rq->push_busy = false;
+       raw_spin_unlock(&rq->lock);
+       raw_spin_unlock_irq(&p->pi_lock);
+
+       put_task_struct(p);
        return 0;
 }
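/*
 * Both migration_cpu_stop() and push_cpu_stop() are cpu-stopper callbacks.
 * The generic pattern they rely on, sketched with illustrative names
 * (my_stop_fn, my_stop_work, my_arg): stop_one_cpu() runs the callback on
 * @cpu's stopper thread and waits for it to finish, while
 * stop_one_cpu_nowait() only queues it, using caller-provided storage, and
 * returns immediately.
 */
#include <linux/stop_machine.h>

static int my_stop_fn(void *arg)
{
	/* runs on the target CPU at stopper priority; must not sleep */
	return 0;
}

static struct cpu_stop_work my_stop_work;

static void my_stop_usage(unsigned int cpu, void *my_arg)
{
	/* synchronous: returns once my_stop_fn() has run on @cpu */
	stop_one_cpu(cpu, my_stop_fn, my_arg);

	/* asynchronous: my_stop_work must stay valid until the callback ran */
	stop_one_cpu_nowait(cpu, my_stop_fn, my_arg, &my_stop_work);
}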
 
@@ -1824,18 +2045,39 @@ static int migration_cpu_stop(void *data)
  * sched_class::set_cpus_allowed must do the below, but is not required to
  * actually call this function.
  */
-void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
 {
+       if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
+               p->cpus_ptr = new_mask;
+               return;
+       }
+
        cpumask_copy(&p->cpus_mask, new_mask);
        p->nr_cpus_allowed = cpumask_weight(new_mask);
 }
 
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+static void
+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
 {
        struct rq *rq = task_rq(p);
        bool queued, running;
 
-       lockdep_assert_held(&p->pi_lock);
+       /*
+        * This here violates the locking rules for affinity, since we're only
+        * supposed to change these variables while holding both rq->lock and
+        * p->pi_lock.
+        *
+        * HOWEVER, it magically works, because ttwu() is the only code that
+        * accesses these variables under p->pi_lock and only does so after
+        * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
+        * before finish_task().
+        *
+        * XXX do further audits, this smells like something putrid.
+        */
+       if (flags & SCA_MIGRATE_DISABLE)
+               SCHED_WARN_ON(!p->on_cpu);
+       else
+               lockdep_assert_held(&p->pi_lock);
 
        queued = task_on_rq_queued(p);
        running = task_current(rq, p);
@@ -1851,7 +2093,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
        if (running)
                put_prev_task(rq, p);
 
-       p->sched_class->set_cpus_allowed(p, new_mask);
+       p->sched_class->set_cpus_allowed(p, new_mask, flags);
 
        if (queued)
                enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
@@ -1859,6 +2101,208 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
                set_next_task(rq, p);
 }
 
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+       __do_set_cpus_allowed(p, new_mask, 0);
+}
+
+/*
+ * This function is wildly self concurrent; here be dragons.
+ *
+ *
+ * When given a valid mask, __set_cpus_allowed_ptr() must block until the
+ * designated task is enqueued on an allowed CPU. If that task is currently
+ * running, we have to kick it out using the CPU stopper.
+ *
+ * Migrate-Disable comes along and tramples all over our nice sandcastle.
+ * Consider:
+ *
+ *     Initial conditions: P0->cpus_mask = [0, 1]
+ *
+ *     P0@CPU0                  P1
+ *
+ *     migrate_disable();
+ *     <preempted>
+ *                              set_cpus_allowed_ptr(P0, [1]);
+ *
+ * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes
+ * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region).
+ * This means we need the following scheme:
+ *
+ *     P0@CPU0                  P1
+ *
+ *     migrate_disable();
+ *     <preempted>
+ *                              set_cpus_allowed_ptr(P0, [1]);
+ *                                <blocks>
+ *     <resumes>
+ *     migrate_enable();
+ *       __set_cpus_allowed_ptr();
+ *       <wakes local stopper>
+ *                         `--> <woken on migration completion>
+ *
+ * Now the fun stuff: there may be several P1-like tasks, i.e. multiple
+ * concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any
+ * task p are serialized by p->pi_lock, which we can leverage: the one that
+ * should come into effect at the end of the Migrate-Disable region is the last
+ * one. This means we only need to track a single cpumask (i.e. p->cpus_mask),
+ * but we still need to properly signal those waiting tasks at the appropriate
+ * moment.
+ *
+ * This is implemented using struct set_affinity_pending. The first
+ * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will
+ * set up an instance of that struct and install it on the targeted task_struct.
+ * Any and all further callers will reuse that instance. Those then wait for
+ * a completion signaled at the tail of the CPU stopper callback (1), triggered
+ * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()).
+ *
+ *
+ * (1) In the cases covered above. There is one more where the completion is
+ * signaled within affine_move_task() itself: when a subsequent affinity request
+ * cancels the need for an active migration. Consider:
+ *
+ *     Initial conditions: P0->cpus_mask = [0, 1]
+ *
+ *     P0@CPU0            P1                             P2
+ *
+ *     migrate_disable();
+ *     <preempted>
+ *                        set_cpus_allowed_ptr(P0, [1]);
+ *                          <blocks>
+ *                                                       set_cpus_allowed_ptr(P0, [0, 1]);
+ *                                                         <signal completion>
+ *                          <awakes>
+ *
+ * Note that the above is safe vs a concurrent migrate_enable(), as any
+ * pending affinity completion is preceded by an uninstallation of
+ * p->migration_pending done with p->pi_lock held.
+ */
+static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
+                           int dest_cpu, unsigned int flags)
+{
+       struct set_affinity_pending my_pending = { }, *pending = NULL;
+       struct migration_arg arg = {
+               .task = p,
+               .dest_cpu = dest_cpu,
+       };
+       bool complete = false;
+
+       /* Can the task run on the task's current CPU? If so, we're done */
+       if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
+               struct task_struct *push_task = NULL;
+
+               if ((flags & SCA_MIGRATE_ENABLE) &&
+                   (p->migration_flags & MDF_PUSH) && !rq->push_busy) {
+                       rq->push_busy = true;
+                       push_task = get_task_struct(p);
+               }
+
+               pending = p->migration_pending;
+               if (pending) {
+                       refcount_inc(&pending->refs);
+                       p->migration_pending = NULL;
+                       complete = true;
+               }
+               task_rq_unlock(rq, p, rf);
+
+               if (push_task) {
+                       stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
+                                           p, &rq->push_work);
+               }
+
+               if (complete)
+                       goto do_complete;
+
+               return 0;
+       }
+
+       if (!(flags & SCA_MIGRATE_ENABLE)) {
+               /* serialized by p->pi_lock */
+               if (!p->migration_pending) {
+                       /* Install the request */
+                       refcount_set(&my_pending.refs, 1);
+                       init_completion(&my_pending.done);
+                       p->migration_pending = &my_pending;
+               } else {
+                       pending = p->migration_pending;
+                       refcount_inc(&pending->refs);
+               }
+       }
+       pending = p->migration_pending;
+       /*
+        * - !MIGRATE_ENABLE:
+        *   we'll have installed a pending if there wasn't one already.
+        *
+        * - MIGRATE_ENABLE:
+        *   we're here because the current CPU isn't matching anymore;
+        *   the only way that can happen is because of a concurrent
+        *   set_cpus_allowed_ptr() call, which should then still be
+        *   pending completion.
+        *
+        * Either way, we really should have a @pending here.
+        */
+       if (WARN_ON_ONCE(!pending)) {
+               task_rq_unlock(rq, p, rf);
+               return -EINVAL;
+       }
+
+       if (flags & SCA_MIGRATE_ENABLE) {
+
+               refcount_inc(&pending->refs); /* pending->{arg,stop_work} */
+               p->migration_flags &= ~MDF_PUSH;
+               task_rq_unlock(rq, p, rf);
+
+               pending->arg = (struct migration_arg) {
+                       .task = p,
+                       .dest_cpu = -1,
+                       .pending = pending,
+               };
+
+               stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
+                                   &pending->arg, &pending->stop_work);
+
+               return 0;
+       }
+
+       if (task_running(rq, p) || p->state == TASK_WAKING) {
+               /*
+                * Lessen races (and headaches) by delegating
+                * is_migration_disabled(p) checks to the stopper, which will
+                * run on the same CPU as said p.
+                */
+               task_rq_unlock(rq, p, rf);
+               stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+
+       } else {
+
+               if (!is_migration_disabled(p)) {
+                       if (task_on_rq_queued(p))
+                               rq = move_queued_task(rq, rf, p, dest_cpu);
+
+                       p->migration_pending = NULL;
+                       complete = true;
+               }
+               task_rq_unlock(rq, p, rf);
+
+do_complete:
+               if (complete)
+                       complete_all(&pending->done);
+       }
+
+       wait_for_completion(&pending->done);
+
+       if (refcount_dec_and_test(&pending->refs))
+               wake_up_var(&pending->refs);
+
+       /*
+        * Block the original owner of &pending until all subsequent callers
+        * have seen the completion and decremented the refcount
+        */
+       wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
+
+       return 0;
+}
+
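/*
 * The set_affinity_pending lifetime managed by affine_move_task() above
 * combines two primitives: a completion to wake every affinity waiter once
 * the task has landed on an allowed CPU, and a refcount so that the
 * stack-allocated instance of the *first* caller is not popped while later
 * callers (or the stopper) still reference it.  A condensed sketch of that
 * pattern with made-up names (my_pending, my_owner, my_waiter, my_completer):
 */
#include <linux/completion.h>
#include <linux/refcount.h>
#include <linux/wait_bit.h>

struct my_pending {
	refcount_t		refs;
	struct completion	done;
};

/* First caller: owns the on-stack instance and publishes it. */
static void my_owner(struct my_pending *pending)
{
	refcount_set(&pending->refs, 1);
	init_completion(&pending->done);

	/* ... publish @pending so later callers and the stopper find it ... */

	wait_for_completion(&pending->done);

	if (refcount_dec_and_test(&pending->refs))
		wake_up_var(&pending->refs);

	/* don't return (and free the stack frame) until all refs are gone */
	wait_var_event(&pending->refs, !refcount_read(&pending->refs));
}

/* Later callers: piggy-back on the already-published instance. */
static void my_waiter(struct my_pending *pending)
{
	refcount_inc(&pending->refs);

	wait_for_completion(&pending->done);

	if (refcount_dec_and_test(&pending->refs))
		wake_up_var(&pending->refs);
}

/* Whoever finishes the migration signals everybody at once. */
static void my_completer(struct my_pending *pending)
{
	complete_all(&pending->done);
}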
 /*
  * Change a given task's CPU affinity. Migrate the thread to a
  * proper CPU and schedule it away if the CPU it's executing on
@@ -1869,7 +2313,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
  * call is not atomic; no spinlocks may be held.
  */
 static int __set_cpus_allowed_ptr(struct task_struct *p,
-                                 const struct cpumask *new_mask, bool check)
+                                 const struct cpumask *new_mask,
+                                 u32 flags)
 {
        const struct cpumask *cpu_valid_mask = cpu_active_mask;
        unsigned int dest_cpu;
@@ -1880,9 +2325,14 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
        rq = task_rq_lock(p, &rf);
        update_rq_clock(rq);
 
-       if (p->flags & PF_KTHREAD) {
+       if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
                /*
-                * Kernel threads are allowed on online && !active CPUs
+                * Kernel threads are allowed on online && !active CPUs.
+                *
+                * Specifically, migration_disabled() tasks must not fail the
+                * cpumask_any_and_distribute() pick below, esp. so on
+                * SCA_MIGRATE_ENABLE, otherwise we'll not call
+                * set_cpus_allowed_common() and actually reset p->cpus_ptr.
                 */
                cpu_valid_mask = cpu_online_mask;
        }
@@ -1891,13 +2341,22 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
         * Must re-check here, to close a race against __kthread_bind(),
         * sched_setaffinity() is not guaranteed to observe the flag.
         */
-       if (check && (p->flags & PF_NO_SETAFFINITY)) {
+       if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
                ret = -EINVAL;
                goto out;
        }
 
-       if (cpumask_equal(&p->cpus_mask, new_mask))
-               goto out;
+       if (!(flags & SCA_MIGRATE_ENABLE)) {
+               if (cpumask_equal(&p->cpus_mask, new_mask))
+                       goto out;
+
+               if (WARN_ON_ONCE(p == current &&
+                                is_migration_disabled(p) &&
+                                !cpumask_test_cpu(task_cpu(p), new_mask))) {
+                       ret = -EBUSY;
+                       goto out;
+               }
+       }
 
        /*
         * Picking a ~random cpu helps in cases where we are changing affinity
@@ -1910,7 +2369,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
                goto out;
        }
 
-       do_set_cpus_allowed(p, new_mask);
+       __do_set_cpus_allowed(p, new_mask, flags);
 
        if (p->flags & PF_KTHREAD) {
                /*
@@ -1922,23 +2381,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
                        p->nr_cpus_allowed != 1);
        }
 
-       /* Can the task run on the task's current CPU? If so, we're done */
-       if (cpumask_test_cpu(task_cpu(p), new_mask))
-               goto out;
+       return affine_move_task(rq, p, &rf, dest_cpu, flags);
 
-       if (task_running(rq, p) || p->state == TASK_WAKING) {
-               struct migration_arg arg = { p, dest_cpu };
-               /* Need help from migration thread: drop lock and wait. */
-               task_rq_unlock(rq, p, &rf);
-               stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
-               return 0;
-       } else if (task_on_rq_queued(p)) {
-               /*
-                * OK, since we're going to drop the lock immediately
-                * afterwards anyway.
-                */
-               rq = move_queued_task(rq, &rf, p, dest_cpu);
-       }
 out:
        task_rq_unlock(rq, p, &rf);
 
@@ -1947,7 +2391,7 @@ out:
 
 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 {
-       return __set_cpus_allowed_ptr(p, new_mask, false);
+       return __set_cpus_allowed_ptr(p, new_mask, 0);
 }
 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
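/*
 * set_cpus_allowed_ptr() stays the only exported entry point; SCA_CHECK and
 * the SCA_MIGRATE_* flags are internal to the scheduler.  A minimal sketch
 * of an out-of-tree style caller pinning a kthread it just created to one
 * CPU -- my_thread_fn and the "my-worker" name are placeholders:
 */
#include <linux/kthread.h>

static struct task_struct *my_spawn_pinned(int cpu, int (*my_thread_fn)(void *))
{
	struct task_struct *tsk;

	tsk = kthread_create(my_thread_fn, NULL, "my-worker/%d", cpu);
	if (IS_ERR(tsk))
		return tsk;

	/* fails (e.g. -EINVAL) if @cpu is not in the valid/active mask */
	if (set_cpus_allowed_ptr(tsk, cpumask_of(cpu)))
		pr_warn("my-worker/%d: could not be pinned to CPU%d\n", cpu, cpu);

	wake_up_process(tsk);
	return tsk;
}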
 
@@ -1988,6 +2432,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
         * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
         */
        WARN_ON_ONCE(!cpu_online(new_cpu));
+
+       WARN_ON_ONCE(is_migration_disabled(p));
 #endif
 
        trace_sched_migrate_task(p, new_cpu);
@@ -2318,6 +2764,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
                        }
                        fallthrough;
                case possible:
+                       /*
+                        * XXX When called from select_task_rq() we only
+                        * hold p->pi_lock and again violate locking order.
+                        *
+                        * More yuck to audit.
+                        */
                        do_set_cpus_allowed(p, cpu_possible_mask);
                        state = fail;
                        break;
@@ -2348,12 +2800,12 @@ out:
  * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
  */
 static inline
-int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
+int select_task_rq(struct task_struct *p, int cpu, int wake_flags)
 {
        lockdep_assert_held(&p->pi_lock);
 
-       if (p->nr_cpus_allowed > 1)
-               cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+       if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
+               cpu = p->sched_class->select_task_rq(p, cpu, wake_flags);
        else
                cpu = cpumask_any(p->cpus_ptr);
 
@@ -2375,6 +2827,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
 
 void sched_set_stop_task(int cpu, struct task_struct *stop)
 {
+       static struct lock_class_key stop_pi_lock;
        struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
        struct task_struct *old_stop = cpu_rq(cpu)->stop;
 
@@ -2390,6 +2843,20 @@ void sched_set_stop_task(int cpu, struct task_struct *stop)
                sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
 
                stop->sched_class = &stop_sched_class;
+
+               /*
+                * The PI code calls rt_mutex_setprio() with ->pi_lock held to
+                * adjust the effective priority of a task. As a result,
+                * rt_mutex_setprio() can trigger (RT) balancing operations,
+                * which can then trigger wakeups of the stop thread to push
+                * around the current task.
+                *
+                * The stop task itself will never be part of the PI-chain, it
+                * never blocks, therefore that ->pi_lock recursion is safe.
+                * Tell lockdep about this by placing the stop->pi_lock in its
+                * own class.
+                */
+               lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
        }
 
        cpu_rq(cpu)->stop = stop;
@@ -2403,15 +2870,23 @@ void sched_set_stop_task(int cpu, struct task_struct *stop)
        }
 }
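/*
 * The lockdep_set_class() call above is the standard idiom for telling
 * lockdep that one particular lock instance follows different nesting rules
 * than the rest of its class.  A generic sketch of that idiom with
 * illustrative names (struct my_obj, my_nested_key):
 */
struct my_obj {
	spinlock_t lock;
};

static struct lock_class_key my_nested_key;

static void my_obj_init_nested(struct my_obj *obj)
{
	spin_lock_init(&obj->lock);
	/*
	 * Give this instance its own class so taking it while another
	 * my_obj::lock is held does not raise a false deadlock report.
	 */
	lockdep_set_class(&obj->lock, &my_nested_key);
}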
 
-#else
+#else /* CONFIG_SMP */
 
 static inline int __set_cpus_allowed_ptr(struct task_struct *p,
-                                        const struct cpumask *new_mask, bool check)
+                                        const struct cpumask *new_mask,
+                                        u32 flags)
 {
        return set_cpus_allowed_ptr(p, new_mask);
 }
 
-#endif /* CONFIG_SMP */
+static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
+
+static inline bool rq_has_pinned_tasks(struct rq *rq)
+{
+       return false;
+}
+
+#endif /* !CONFIG_SMP */
 
 static void
 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
@@ -2465,7 +2940,7 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
 #ifdef CONFIG_SMP
        if (p->sched_class->task_woken) {
                /*
-                * Our task @p is fully woken up and running; so its safe to
+                * Our task @p is fully woken up and running; so it's safe to
                 * drop the rq->lock, hereafter rq is only used for statistics.
                 */
                rq_unpin_lock(rq, rf);
@@ -2952,7 +3427,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 
        /*
         * If the owning (remote) CPU is still in the middle of schedule() with
-        * this task as prev, wait until its done referencing the task.
+        * this task as prev, wait until it's done referencing the task.
         *
         * Pairs with the smp_store_release() in finish_task().
         *
@@ -2961,7 +3436,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
         */
        smp_cond_load_acquire(&p->on_cpu, !VAL);
 
-       cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
+       cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
        if (task_cpu(p) != cpu) {
                if (p->in_iowait) {
                        delayacct_blkio_end(p);
@@ -3103,6 +3578,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
        init_numa_balancing(clone_flags, p);
 #ifdef CONFIG_SMP
        p->wake_entry.u_flags = CSD_TYPE_TTWU;
+       p->migration_pending = NULL;
 #endif
 }
 
@@ -3349,7 +3825,7 @@ void wake_up_new_task(struct task_struct *p)
         */
        p->recent_used_cpu = task_cpu(p);
        rseq_migrate(p);
-       __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
+       __set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK));
 #endif
        rq = __task_rq_lock(p, &rf);
        update_rq_clock(rq);
@@ -3361,7 +3837,7 @@ void wake_up_new_task(struct task_struct *p)
 #ifdef CONFIG_SMP
        if (p->sched_class->task_woken) {
                /*
-                * Nothing relies on rq->lock after this, so its fine to
+                * Nothing relies on rq->lock after this, so it's fine to
                 * drop it.
                 */
                rq_unpin_lock(rq, &rf);
@@ -3490,6 +3966,90 @@ static inline void finish_task(struct task_struct *prev)
 #endif
 }
 
+#ifdef CONFIG_SMP
+
+static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
+{
+       void (*func)(struct rq *rq);
+       struct callback_head *next;
+
+       lockdep_assert_held(&rq->lock);
+
+       while (head) {
+               func = (void (*)(struct rq *))head->func;
+               next = head->next;
+               head->next = NULL;
+               head = next;
+
+               func(rq);
+       }
+}
+
+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+{
+       struct callback_head *head = rq->balance_callback;
+
+       lockdep_assert_held(&rq->lock);
+       if (head) {
+               rq->balance_callback = NULL;
+               rq->balance_flags &= ~BALANCE_WORK;
+       }
+
+       return head;
+}
+
+static void __balance_callbacks(struct rq *rq)
+{
+       do_balance_callbacks(rq, splice_balance_callbacks(rq));
+}
+
+static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
+{
+       unsigned long flags;
+
+       if (unlikely(head)) {
+               raw_spin_lock_irqsave(&rq->lock, flags);
+               do_balance_callbacks(rq, head);
+               raw_spin_unlock_irqrestore(&rq->lock, flags);
+       }
+}
+
+static void balance_push(struct rq *rq);
+
+static inline void balance_switch(struct rq *rq)
+{
+       if (likely(!rq->balance_flags))
+               return;
+
+       if (rq->balance_flags & BALANCE_PUSH) {
+               balance_push(rq);
+               return;
+       }
+
+       __balance_callbacks(rq);
+}
+
+#else
+
+static inline void __balance_callbacks(struct rq *rq)
+{
+}
+
+static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+{
+       return NULL;
+}
+
+static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
+{
+}
+
+static inline void balance_switch(struct rq *rq)
+{
+}
+
+#endif
+
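/*
 * The callbacks run by do_balance_callbacks() are queued, under rq->lock,
 * via queue_balance_callback() in kernel/sched/sched.h -- typically by a
 * scheduling class that wants to do balancing work once rq->lock can be
 * dropped.  A sketch of the producer side, with my_push_callback and
 * my_cb_head as illustrative names (the callback_head storage must be
 * per-CPU and outlive the callback):
 */
static DEFINE_PER_CPU(struct callback_head, my_cb_head);

static void my_push_callback(struct rq *rq)
{
	/*
	 * Invoked later with rq->lock held, from __balance_callbacks() or
	 * balance_callbacks() after the PI chain has been adjusted.
	 */
}

static void my_queue_balance_work(struct rq *rq)
{
	lockdep_assert_held(&rq->lock);
	queue_balance_callback(rq, &per_cpu(my_cb_head, rq->cpu), my_push_callback);
}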
 static inline void
 prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
 {
@@ -3515,6 +4075,7 @@ static inline void finish_lock_switch(struct rq *rq)
         * prev into current:
         */
        spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
+       balance_switch(rq);
        raw_spin_unlock_irq(&rq->lock);
 }
 
@@ -3656,43 +4217,6 @@ static struct rq *finish_task_switch(struct task_struct *prev)
        return rq;
 }
 
-#ifdef CONFIG_SMP
-
-/* rq->lock is NOT held, but preemption is disabled */
-static void __balance_callback(struct rq *rq)
-{
-       struct callback_head *head, *next;
-       void (*func)(struct rq *rq);
-       unsigned long flags;
-
-       raw_spin_lock_irqsave(&rq->lock, flags);
-       head = rq->balance_callback;
-       rq->balance_callback = NULL;
-       while (head) {
-               func = (void (*)(struct rq *))head->func;
-               next = head->next;
-               head->next = NULL;
-               head = next;
-
-               func(rq);
-       }
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
-static inline void balance_callback(struct rq *rq)
-{
-       if (unlikely(rq->balance_callback))
-               __balance_callback(rq);
-}
-
-#else
-
-static inline void balance_callback(struct rq *rq)
-{
-}
-
-#endif
-
 /**
  * schedule_tail - first thing a freshly forked thread must call.
  * @prev: the thread we just switched away from.
@@ -3712,7 +4236,6 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
         */
 
        rq = finish_task_switch(prev);
-       balance_callback(rq);
        preempt_enable();
 
        if (current->set_child_tid)
@@ -3841,7 +4364,7 @@ unsigned long nr_iowait_cpu(int cpu)
 }
 
 /*
- * IO-wait accounting, and how its mostly bollocks (on SMP).
+ * IO-wait accounting, and how it's mostly bollocks (on SMP).
  *
  * The idea behind IO-wait account is to account the idle time that we could
  * have spend running if it were not for IO. That is, if we were to improve the
@@ -3893,7 +4416,7 @@ void sched_exec(void)
        int dest_cpu;
 
        raw_spin_lock_irqsave(&p->pi_lock, flags);
-       dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
+       dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC);
        if (dest_cpu == smp_processor_id())
                goto unlock;
 
@@ -4336,7 +4859,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
        /*
         * Optimization: we know that if all tasks are in the fair class we can
         * call that function directly, but only if the @prev task wasn't of a
-        * higher scheduling class, because otherwise those loose the
+        * higher scheduling class, because otherwise those lose the
         * opportunity to pull in more work from other CPUs.
         */
        if (likely(prev->sched_class <= &fair_sched_class &&
@@ -4520,6 +5043,7 @@ static void __sched notrace __schedule(bool preempt)
                 */
                ++*switch_count;
 
+               migrate_disable_switch(rq, prev);
                psi_sched_switch(prev, next, !task_on_rq_queued(prev));
 
                trace_sched_switch(preempt, prev, next);
@@ -4528,10 +5052,11 @@ static void __sched notrace __schedule(bool preempt)
                rq = context_switch(rq, prev, next, &rf);
        } else {
                rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
-               rq_unlock_irq(rq, &rf);
-       }
 
-       balance_callback(rq);
+               rq_unpin_lock(rq, &rf);
+               __balance_callbacks(rq);
+               raw_spin_unlock_irq(&rq->lock);
+       }
 }
 
 void __noreturn do_task_dead(void)
@@ -4857,7 +5382,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
         * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
         * ensure a task is de-boosted (pi_task is set to NULL) before the
         * task is allowed to run again (and can exit). This ensures the pointer
-        * points to a blocked task -- which guaratees the task is present.
+        * points to a blocked task -- which guarantees the task is present.
         */
        p->pi_top_task = pi_task;
 
@@ -4943,9 +5468,11 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 out_unlock:
        /* Avoid rq from going away on us: */
        preempt_disable();
-       __task_rq_unlock(rq, &rf);
 
-       balance_callback(rq);
+       rq_unpin_lock(rq, &rf);
+       __balance_callbacks(rq);
+       raw_spin_unlock(&rq->lock);
+
        preempt_enable();
 }
 #else
@@ -4974,7 +5501,7 @@ void set_user_nice(struct task_struct *p, long nice)
        /*
         * The RT priorities are set via sched_setscheduler(), but we still
         * allow the 'normal' nice value to be set - but as expected
-        * it wont have any effect on scheduling until the task is
+        * it won't have any effect on scheduling until the task is
         * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
         */
        if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
@@ -5219,6 +5746,7 @@ static int __sched_setscheduler(struct task_struct *p,
        int retval, oldprio, oldpolicy = -1, queued, running;
        int new_effective_prio, policy = attr->sched_policy;
        const struct sched_class *prev_class;
+       struct callback_head *head;
        struct rq_flags rf;
        int reset_on_fork;
        int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
@@ -5457,6 +5985,7 @@ change:
 
        /* Avoid rq from going away on us: */
        preempt_disable();
+       head = splice_balance_callbacks(rq);
        task_rq_unlock(rq, p, &rf);
 
        if (pi) {
@@ -5465,7 +5994,7 @@ change:
        }
 
        /* Run balance callbacks after we've adjusted the PI chain: */
-       balance_callback(rq);
+       balance_callbacks(rq, head);
        preempt_enable();
 
        return 0;
@@ -5960,7 +6489,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
        }
 #endif
 again:
-       retval = __set_cpus_allowed_ptr(p, new_mask, true);
+       retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);
 
        if (!retval) {
                cpuset_cpus_allowed(p, cpus_allowed);
@@ -6100,12 +6629,8 @@ static void do_sched_yield(void)
        schedstat_inc(rq->yld_count);
        current->sched_class->yield_task(rq);
 
-       /*
-        * Since we are going to call schedule() anyway, there's
-        * no need to preempt or enable interrupts:
-        */
        preempt_disable();
-       rq_unlock(rq, &rf);
+       rq_unlock_irq(rq, &rf);
        sched_preempt_enable_no_resched();
 
        schedule();
@@ -6165,7 +6690,7 @@ EXPORT_SYMBOL(__cond_resched_lock);
  *
  * The scheduler is at all times free to pick the calling task as the most
  * eligible task to run, if removing the yield() call from your code breaks
- * it, its already broken.
+ * it, it's already broken.
  *
  * Typical broken usage is:
  *
@@ -6453,6 +6978,7 @@ void sched_show_task(struct task_struct *p)
                (unsigned long)task_thread_info(p)->flags);
 
        print_worker_info(KERN_INFO, p);
+       print_stop_info(KERN_INFO, p);
        show_stack(p, NULL, KERN_INFO);
        put_task_stack(p);
 }
@@ -6538,12 +7064,12 @@ void init_idle(struct task_struct *idle, int cpu)
 
 #ifdef CONFIG_SMP
        /*
-        * Its possible that init_idle() gets called multiple times on a task,
+        * It's possible that init_idle() gets called multiple times on a task,
         * in that case do_set_cpus_allowed() will not do the right thing.
         *
         * And since this is boot we can forgo the serialization.
         */
-       set_cpus_allowed_common(idle, cpumask_of(cpu));
+       set_cpus_allowed_common(idle, cpumask_of(cpu), 0);
 #endif
        /*
         * We're having a chicken and egg problem, even though we are
@@ -6694,119 +7220,126 @@ void idle_task_exit(void)
        /* finish_cpu(), as ran on the BP, will clean up the active_mm state */
 }
 
-/*
- * Since this CPU is going 'away' for a while, fold any nr_active delta
- * we might have. Assumes we're called after migrate_tasks() so that the
- * nr_active count is stable. We need to take the teardown thread which
- * is calling this into account, so we hand in adjust = 1 to the load
- * calculation.
- *
- * Also see the comment "Global load-average calculations".
- */
-static void calc_load_migrate(struct rq *rq)
+static int __balance_push_cpu_stop(void *arg)
 {
-       long delta = calc_load_fold_active(rq, 1);
-       if (delta)
-               atomic_long_add(delta, &calc_load_tasks);
-}
+       struct task_struct *p = arg;
+       struct rq *rq = this_rq();
+       struct rq_flags rf;
+       int cpu;
 
-static struct task_struct *__pick_migrate_task(struct rq *rq)
-{
-       const struct sched_class *class;
-       struct task_struct *next;
+       raw_spin_lock_irq(&p->pi_lock);
+       rq_lock(rq, &rf);
 
-       for_each_class(class) {
-               next = class->pick_next_task(rq);
-               if (next) {
-                       next->sched_class->put_prev_task(rq, next);
-                       return next;
-               }
+       update_rq_clock(rq);
+
+       if (task_rq(p) == rq && task_on_rq_queued(p)) {
+               cpu = select_fallback_rq(rq->cpu, p);
+               rq = __migrate_task(rq, &rf, p, cpu);
        }
 
-       /* The idle class should always have a runnable task */
-       BUG();
+       rq_unlock(rq, &rf);
+       raw_spin_unlock_irq(&p->pi_lock);
+
+       put_task_struct(p);
+
+       return 0;
 }
 
+static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
+
 /*
- * Migrate all tasks from the rq, sleeping tasks will be migrated by
- * try_to_wake_up()->select_task_rq().
- *
- * Called with rq->lock held even though we'er in stop_machine() and
- * there's no concurrency possible, we hold the required locks anyway
- * because of lock validation efforts.
+ * Ensure we only run per-cpu kthreads once the CPU goes !active.
  */
-static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
+static void balance_push(struct rq *rq)
 {
-       struct rq *rq = dead_rq;
-       struct task_struct *next, *stop = rq->stop;
-       struct rq_flags orf = *rf;
-       int dest_cpu;
+       struct task_struct *push_task = rq->curr;
+
+       lockdep_assert_held(&rq->lock);
+       SCHED_WARN_ON(rq->cpu != smp_processor_id());
 
        /*
-        * Fudge the rq selection such that the below task selection loop
-        * doesn't get stuck on the currently eligible stop task.
-        *
-        * We're currently inside stop_machine() and the rq is either stuck
-        * in the stop_machine_cpu_stop() loop, or we're executing this code,
-        * either way we should never end up calling schedule() until we're
-        * done here.
+        * Both the cpu-hotplug and stop task are in this case and are
+        * required to complete the hotplug process.
         */
-       rq->stop = NULL;
+       if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) {
+               /*
+                * If this is the idle task on the outgoing CPU try to wake
+                * up the hotplug control thread which might wait for the
+                * last task to vanish. The rcuwait_active() check is
+                * accurate here because the waiter is pinned on this CPU
+                * and can't obviously be running in parallel.
+                *
+                * On RT kernels this also has to check whether there are
+                * pinned and scheduled out tasks on the runqueue. They
+                * need to leave the migrate disabled section first.
+                */
+               if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
+                   rcuwait_active(&rq->hotplug_wait)) {
+                       raw_spin_unlock(&rq->lock);
+                       rcuwait_wake_up(&rq->hotplug_wait);
+                       raw_spin_lock(&rq->lock);
+               }
+               return;
+       }
 
+       get_task_struct(push_task);
        /*
-        * put_prev_task() and pick_next_task() sched
-        * class method both need to have an up-to-date
-        * value of rq->clock[_task]
+        * Temporarily drop rq->lock such that we can wake-up the stop task.
+        * Both preemption and IRQs are still disabled.
         */
-       update_rq_clock(rq);
+       raw_spin_unlock(&rq->lock);
+       stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
+                           this_cpu_ptr(&push_work));
+       /*
+        * At this point need_resched() is true and we'll take the loop in
+        * schedule(). The next pick is obviously going to be the stop task
+        * which is_per_cpu_kthread() and will push this task away.
+        */
+       raw_spin_lock(&rq->lock);
+}
 
-       for (;;) {
-               /*
-                * There's this thread running, bail when that's the only
-                * remaining thread:
-                */
-               if (rq->nr_running == 1)
-                       break;
+static void balance_push_set(int cpu, bool on)
+{
+       struct rq *rq = cpu_rq(cpu);
+       struct rq_flags rf;
 
-               next = __pick_migrate_task(rq);
+       rq_lock_irqsave(rq, &rf);
+       if (on)
+               rq->balance_flags |= BALANCE_PUSH;
+       else
+               rq->balance_flags &= ~BALANCE_PUSH;
+       rq_unlock_irqrestore(rq, &rf);
+}
 
-               /*
-                * Rules for changing task_struct::cpus_mask are holding
-                * both pi_lock and rq->lock, such that holding either
-                * stabilizes the mask.
-                *
-                * Drop rq->lock is not quite as disastrous as it usually is
-                * because !cpu_active at this point, which means load-balance
-                * will not interfere. Also, stop-machine.
-                */
-               rq_unlock(rq, rf);
-               raw_spin_lock(&next->pi_lock);
-               rq_relock(rq, rf);
+/*
+ * Invoked from a CPU's hotplug control thread after the CPU has been marked
+ * inactive. All tasks which are not per CPU kernel threads are either
+ * pushed off this CPU now via balance_push() or placed on a different CPU
+ * during wakeup. Wait until the CPU is quiescent.
+ */
+static void balance_hotplug_wait(void)
+{
+       struct rq *rq = this_rq();
 
-               /*
-                * Since we're inside stop-machine, _nothing_ should have
-                * changed the task, WARN if weird stuff happened, because in
-                * that case the above rq->lock drop is a fail too.
-                */
-               if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
-                       raw_spin_unlock(&next->pi_lock);
-                       continue;
-               }
+       rcuwait_wait_event(&rq->hotplug_wait,
+                          rq->nr_running == 1 && !rq_has_pinned_tasks(rq),
+                          TASK_UNINTERRUPTIBLE);
+}
 
-               /* Find suitable destination for @next, with force if needed. */
-               dest_cpu = select_fallback_rq(dead_rq->cpu, next);
-               rq = __migrate_task(rq, rf, next, dest_cpu);
-               if (rq != dead_rq) {
-                       rq_unlock(rq, rf);
-                       rq = dead_rq;
-                       *rf = orf;
-                       rq_relock(rq, rf);
-               }
-               raw_spin_unlock(&next->pi_lock);
-       }
+#else
 
-       rq->stop = stop;
+static inline void balance_push(struct rq *rq)
+{
 }
+
+static inline void balance_push_set(int cpu, bool on)
+{
+}
+
+static inline void balance_hotplug_wait(void)
+{
+}
+
 #endif /* CONFIG_HOTPLUG_CPU */
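/*
 * balance_push() and balance_hotplug_wait() above are built on the rcuwait
 * primitive: a single waiter parks itself with rcuwait_wait_event() and
 * whoever makes the condition true wakes it with rcuwait_wake_up(), which
 * is a no-op when nobody is waiting.  A generic sketch with made-up names
 * (my_wait, my_done):
 */
#include <linux/rcuwait.h>

static struct rcuwait my_wait;
static bool my_done;

static void my_waiter_side(void)
{
	rcuwait_init(&my_wait);

	/* sleeps until my_done is observed true */
	rcuwait_wait_event(&my_wait, READ_ONCE(my_done), TASK_UNINTERRUPTIBLE);
}

static void my_waker_side(void)
{
	WRITE_ONCE(my_done, true);
	rcuwait_wake_up(&my_wait);
}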
 
 void set_rq_online(struct rq *rq)
@@ -6892,6 +7425,8 @@ int sched_cpu_activate(unsigned int cpu)
        struct rq *rq = cpu_rq(cpu);
        struct rq_flags rf;
 
+       balance_push_set(cpu, false);
+
 #ifdef CONFIG_SCHED_SMT
        /*
         * When going up, increment the number of cores with SMT present.
@@ -6927,6 +7462,8 @@ int sched_cpu_activate(unsigned int cpu)
 
 int sched_cpu_deactivate(unsigned int cpu)
 {
+       struct rq *rq = cpu_rq(cpu);
+       struct rq_flags rf;
        int ret;
 
        set_cpu_active(cpu, false);
@@ -6939,6 +7476,16 @@ int sched_cpu_deactivate(unsigned int cpu)
         */
        synchronize_rcu();
 
+       balance_push_set(cpu, true);
+
+       rq_lock_irqsave(rq, &rf);
+       if (rq->rd) {
+               update_rq_clock(rq);
+               BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+               set_rq_offline(rq);
+       }
+       rq_unlock_irqrestore(rq, &rf);
+
 #ifdef CONFIG_SCHED_SMT
        /*
         * When going down, decrement the number of cores with SMT present.
@@ -6952,6 +7499,7 @@ int sched_cpu_deactivate(unsigned int cpu)
 
        ret = cpuset_cpu_inactive(cpu);
        if (ret) {
+               balance_push_set(cpu, false);
                set_cpu_active(cpu, true);
                return ret;
        }
@@ -6975,6 +7523,41 @@ int sched_cpu_starting(unsigned int cpu)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Invoked immediately before the stopper thread is invoked to bring the
+ * CPU down completely. At this point all per CPU kthreads except the
+ * hotplug thread (current) and the stopper thread (inactive) have been
+ * either parked or have been unbound from the outgoing CPU. Ensure that
+ * any of those which might be on the way out are gone.
+ *
+ * If after this point a bound task is being woken on this CPU then the
+ * responsible hotplug callback has failed to do its job.
+ * sched_cpu_dying() will catch it with the appropriate fireworks.
+ */
+int sched_cpu_wait_empty(unsigned int cpu)
+{
+       balance_hotplug_wait();
+       return 0;
+}
+
+/*
+ * Since this CPU is going 'away' for a while, fold any nr_active delta we
+ * might have. Called from the CPU stopper task after ensuring that the
+ * stopper is the last running task on the CPU, so nr_active count is
+ * stable. We need to take the teardown thread which is calling this into
+ * account, so we hand in adjust = 1 to the load calculation.
+ *
+ * Also see the comment "Global load-average calculations".
+ */
+static void calc_load_migrate(struct rq *rq)
+{
+       long delta = calc_load_fold_active(rq, 1);
+
+       if (delta)
+               atomic_long_add(delta, &calc_load_tasks);
+}
+
 int sched_cpu_dying(unsigned int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
@@ -6984,12 +7567,7 @@ int sched_cpu_dying(unsigned int cpu)
        sched_tick_stop(cpu);
 
        rq_lock_irqsave(rq, &rf);
-       if (rq->rd) {
-               BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
-               set_rq_offline(rq);
-       }
-       migrate_tasks(rq, &rf);
-       BUG_ON(rq->nr_running != 1);
+       BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
        rq_unlock_irqrestore(rq, &rf);
 
        calc_load_migrate(rq);
@@ -7194,7 +7772,10 @@ void __init sched_init(void)
                rq->last_blocked_load_update_tick = jiffies;
                atomic_set(&rq->nohz_flags, 0);
 
-               rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
+               INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq);
+#endif
+#ifdef CONFIG_HOTPLUG_CPU
+               rcuwait_init(&rq->hotplug_wait);
 #endif
 #endif /* CONFIG_SMP */
                hrtick_rq_init(rq);
@@ -7333,6 +7914,39 @@ void __cant_sleep(const char *file, int line, int preempt_offset)
        add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
 }
 EXPORT_SYMBOL_GPL(__cant_sleep);
+
+#ifdef CONFIG_SMP
+void __cant_migrate(const char *file, int line)
+{
+       static unsigned long prev_jiffy;
+
+       if (irqs_disabled())
+               return;
+
+       if (is_migration_disabled(current))
+               return;
+
+       if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
+               return;
+
+       if (preempt_count() > 0)
+               return;
+
+       if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+               return;
+       prev_jiffy = jiffies;
+
+       pr_err("BUG: assuming non migratable context at %s:%d\n", file, line);
+       pr_err("in_atomic(): %d, irqs_disabled(): %d, migration_disabled() %u pid: %d, name: %s\n",
+              in_atomic(), irqs_disabled(), is_migration_disabled(current),
+              current->pid, current->comm);
+
+       debug_show_held_locks(current);
+       dump_stack();
+       add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+}
+EXPORT_SYMBOL_GPL(__cant_migrate);
+#endif
 #endif
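/*
 * Like __cant_sleep() above, __cant_migrate() is a debugging assertion: it
 * complains when code that relies on staying on the current CPU can in fact
 * be migrated (preemptible, IRQs on, and no migrate_disable() section).
 * It is normally reached through a cant_migrate() wrapper macro analogous
 * to cant_sleep(); the direct call below is only a sketch, and
 * my_percpu_fastpath() is an illustrative name:
 */
static void my_percpu_fastpath(void)
{
	/* document and (with debugging enabled) verify the pinning assumption */
	__cant_migrate(__FILE__, __LINE__);

	/* ... operate on this CPU's data, possibly preemptibly ... */
}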
 
 #ifdef CONFIG_MAGIC_SYSRQ
@@ -7666,7 +8280,7 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
                        return -EINVAL;
 #endif
                /*
-                * Serialize against wake_up_new_task() such that if its
+                * Serialize against wake_up_new_task() such that if it's
                 * running, we're sure to observe its full state.
                 */
                raw_spin_lock_irq(&task->pi_lock);