sched: Prevent balance_push() on remote runqueues
[linux-2.6-microblaze.git] / kernel / sched / core.c
index 4ca80df..b21a185 100644
@@ -84,6 +84,286 @@ unsigned int sysctl_sched_rt_period = 1000000;
 
 __read_mostly int scheduler_running;
 
+#ifdef CONFIG_SCHED_CORE
+
+DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
+
+/* kernel prio, less is more */
+static inline int __task_prio(struct task_struct *p)
+{
+       if (p->sched_class == &stop_sched_class) /* trumps deadline */
+               return -2;
+
+       if (rt_prio(p->prio)) /* includes deadline */
+               return p->prio; /* [-1, 99] */
+
+       if (p->sched_class == &idle_sched_class)
+               return MAX_RT_PRIO + NICE_WIDTH; /* 140 */
+
+       return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */
+}
+
+/*
+ * l(a,b)
+ * le(a,b) := !l(b,a)
+ * g(a,b)  := l(b,a)
+ * ge(a,b) := !l(a,b)
+ */
+
+/* real prio, less is less */
+static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
+{
+
+       int pa = __task_prio(a), pb = __task_prio(b);
+
+       if (-pa < -pb)
+               return true;
+
+       if (-pb < -pa)
+               return false;
+
+       if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
+               return !dl_time_before(a->dl.deadline, b->dl.deadline);
+
+       if (pa == MAX_RT_PRIO + MAX_NICE)       /* fair */
+               return cfs_prio_less(a, b, in_fi);
+
+       return false;
+}
+
+static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b)
+{
+       if (a->core_cookie < b->core_cookie)
+               return true;
+
+       if (a->core_cookie > b->core_cookie)
+               return false;
+
+       /* flip prio, so high prio is leftmost */
+       if (prio_less(b, a, task_rq(a)->core->core_forceidle))
+               return true;
+
+       return false;
+}
+
+#define __node_2_sc(node) rb_entry((node), struct task_struct, core_node)
+
+static inline bool rb_sched_core_less(struct rb_node *a, const struct rb_node *b)
+{
+       return __sched_core_less(__node_2_sc(a), __node_2_sc(b));
+}
+
+static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node)
+{
+       const struct task_struct *p = __node_2_sc(node);
+       unsigned long cookie = (unsigned long)key;
+
+       if (cookie < p->core_cookie)
+               return -1;
+
+       if (cookie > p->core_cookie)
+               return 1;
+
+       return 0;
+}
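
Editor's note: the comparators above give the core tree a lexicographic order — cookie ascending, and within one cookie the highest-priority task leftmost (note __sched_core_less() calls prio_less() with its arguments flipped). As a rough illustration only, the same ordering expressed with a plain struct and qsort() in user space; the struct and field names are invented for the sketch and are not kernel API:

#include <stdio.h>
#include <stdlib.h>

struct item {
	unsigned long cookie;	/* group tag, 0 == untagged */
	int kprio;		/* kernel prio, less is more */
};

/* Order by cookie ascending, then best (lowest kprio) first, mirroring
 * __sched_core_less(): within one cookie the leftmost entry is the one
 * sched_core_find() should return. */
static int item_cmp(const void *pa, const void *pb)
{
	const struct item *a = pa, *b = pb;

	if (a->cookie != b->cookie)
		return a->cookie < b->cookie ? -1 : 1;
	return a->kprio - b->kprio;
}

int main(void)
{
	struct item v[] = { { 2, 120 }, { 1, 50 }, { 2, 90 }, { 1, 120 } };

	qsort(v, sizeof(v) / sizeof(v[0]), sizeof(v[0]), item_cmp);
	for (unsigned int i = 0; i < sizeof(v) / sizeof(v[0]); i++)
		printf("cookie=%lu kprio=%d\n", v[i].cookie, v[i].kprio);
	return 0;
}
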
+
+void sched_core_enqueue(struct rq *rq, struct task_struct *p)
+{
+       rq->core->core_task_seq++;
+
+       if (!p->core_cookie)
+               return;
+
+       rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
+}
+
+void sched_core_dequeue(struct rq *rq, struct task_struct *p)
+{
+       rq->core->core_task_seq++;
+
+       if (!sched_core_enqueued(p))
+               return;
+
+       rb_erase(&p->core_node, &rq->core_tree);
+       RB_CLEAR_NODE(&p->core_node);
+}
+
+/*
+ * Find left-most (aka, highest priority) task matching @cookie.
+ */
+static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
+{
+       struct rb_node *node;
+
+       node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp);
+       /*
+        * The idle task always matches any cookie!
+        */
+       if (!node)
+               return idle_sched_class.pick_task(rq);
+
+       return __node_2_sc(node);
+}
+
+static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie)
+{
+       struct rb_node *node = &p->core_node;
+
+       node = rb_next(node);
+       if (!node)
+               return NULL;
+
+       p = container_of(node, struct task_struct, core_node);
+       if (p->core_cookie != cookie)
+               return NULL;
+
+       return p;
+}
+
+/*
+ * Magic required such that:
+ *
+ *     raw_spin_rq_lock(rq);
+ *     ...
+ *     raw_spin_rq_unlock(rq);
+ *
+ * ends up locking and unlocking the _same_ lock, and all CPUs
+ * always agree on what rq has what lock.
+ *
+ * XXX entirely possible to selectively enable cores, don't bother for now.
+ */
+
+static DEFINE_MUTEX(sched_core_mutex);
+static atomic_t sched_core_count;
+static struct cpumask sched_core_mask;
+
+static void sched_core_lock(int cpu, unsigned long *flags)
+{
+       const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+       int t, i = 0;
+
+       local_irq_save(*flags);
+       for_each_cpu(t, smt_mask)
+               raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
+}
+
+static void sched_core_unlock(int cpu, unsigned long *flags)
+{
+       const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+       int t;
+
+       for_each_cpu(t, smt_mask)
+               raw_spin_unlock(&cpu_rq(t)->__lock);
+       local_irq_restore(*flags);
+}
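
Editor's note: sched_core_lock()/sched_core_unlock() above take every SMT sibling's rq lock with IRQs off and increasing lockdep subclasses, so per-core state can be changed atomically with respect to all siblings. A hedged user-space analogue of that "lock the whole fixed group in one stable order" idiom, with pthread mutexes standing in for the rq locks (nothing below is kernel code):

#include <pthread.h>

#define NR_SIBLINGS 2

static pthread_mutex_t sibling_lock[NR_SIBLINGS] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

/* Acquire every sibling lock in a fixed (index) order, so that two
 * concurrent group-lockers can never deadlock against each other. */
static void core_group_lock(void)
{
	for (int i = 0; i < NR_SIBLINGS; i++)
		pthread_mutex_lock(&sibling_lock[i]);
}

static void core_group_unlock(void)
{
	/* Release order is irrelevant for correctness. */
	for (int i = NR_SIBLINGS - 1; i >= 0; i--)
		pthread_mutex_unlock(&sibling_lock[i]);
}

int main(void)
{
	core_group_lock();
	/* ... flip per-"core" state visible to all siblings here ... */
	core_group_unlock();
	return 0;
}
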
+
+static void __sched_core_flip(bool enabled)
+{
+       unsigned long flags;
+       int cpu, t;
+
+       cpus_read_lock();
+
+       /*
+        * Toggle the online cores, one by one.
+        */
+       cpumask_copy(&sched_core_mask, cpu_online_mask);
+       for_each_cpu(cpu, &sched_core_mask) {
+               const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+
+               sched_core_lock(cpu, &flags);
+
+               for_each_cpu(t, smt_mask)
+                       cpu_rq(t)->core_enabled = enabled;
+
+               sched_core_unlock(cpu, &flags);
+
+               cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
+       }
+
+       /*
+        * Toggle the offline CPUs.
+        */
+       cpumask_copy(&sched_core_mask, cpu_possible_mask);
+       cpumask_andnot(&sched_core_mask, &sched_core_mask, cpu_online_mask);
+
+       for_each_cpu(cpu, &sched_core_mask)
+               cpu_rq(cpu)->core_enabled = enabled;
+
+       cpus_read_unlock();
+}
+
+static void sched_core_assert_empty(void)
+{
+       int cpu;
+
+       for_each_possible_cpu(cpu)
+               WARN_ON_ONCE(!RB_EMPTY_ROOT(&cpu_rq(cpu)->core_tree));
+}
+
+static void __sched_core_enable(void)
+{
+       static_branch_enable(&__sched_core_enabled);
+       /*
+        * Ensure all previous instances of raw_spin_rq_*lock() have finished
+        * and future ones will observe !sched_core_disabled().
+        */
+       synchronize_rcu();
+       __sched_core_flip(true);
+       sched_core_assert_empty();
+}
+
+static void __sched_core_disable(void)
+{
+       sched_core_assert_empty();
+       __sched_core_flip(false);
+       static_branch_disable(&__sched_core_enabled);
+}
+
+void sched_core_get(void)
+{
+       if (atomic_inc_not_zero(&sched_core_count))
+               return;
+
+       mutex_lock(&sched_core_mutex);
+       if (!atomic_read(&sched_core_count))
+               __sched_core_enable();
+
+       smp_mb__before_atomic();
+       atomic_inc(&sched_core_count);
+       mutex_unlock(&sched_core_mutex);
+}
+
+static void __sched_core_put(struct work_struct *work)
+{
+       if (atomic_dec_and_mutex_lock(&sched_core_count, &sched_core_mutex)) {
+               __sched_core_disable();
+               mutex_unlock(&sched_core_mutex);
+       }
+}
+
+void sched_core_put(void)
+{
+       static DECLARE_WORK(_work, __sched_core_put);
+
+       /*
+        * "There can be only one"
+        *
+        * Either this is the last one, or we don't actually need to do any
+        * 'work'. If it is the last *again*, we rely on
+        * WORK_STRUCT_PENDING_BIT.
+        */
+       if (!atomic_add_unless(&sched_core_count, -1, 1))
+               schedule_work(&_work);
+}
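
Editor's note: sched_core_get()/sched_core_put() follow the usual "fast-path refcount, slow-path under a mutex" pattern: only the 0 <-> 1 transitions do real work, and the final put defers the expensive disable to a workqueue because it can be reached from atomic context. A simplified sketch of the same pattern using C11 atomics and a pthread mutex; feature_enable()/feature_disable() are hypothetical stand-ins, and the workqueue deferral is deliberately omitted:

#include <pthread.h>
#include <stdatomic.h>

static atomic_int feature_count;
static pthread_mutex_t feature_mutex = PTHREAD_MUTEX_INITIALIZER;

static void feature_enable(void)  { /* stand-in for the expensive enable  */ }
static void feature_disable(void) { /* stand-in for the expensive disable */ }

static void feature_get(void)
{
	int old = atomic_load(&feature_count);

	/* Fast path: already enabled, just take another reference. */
	while (old) {
		if (atomic_compare_exchange_weak(&feature_count, &old, old + 1))
			return;
	}

	/* Slow path: serialize the 0 -> 1 transition against the last put. */
	pthread_mutex_lock(&feature_mutex);
	if (!atomic_load(&feature_count))
		feature_enable();
	atomic_fetch_add(&feature_count, 1);
	pthread_mutex_unlock(&feature_mutex);
}

static void feature_put(void)
{
	pthread_mutex_lock(&feature_mutex);
	if (atomic_fetch_sub(&feature_count, 1) == 1)
		feature_disable();
	pthread_mutex_unlock(&feature_mutex);
}

int main(void)
{
	feature_get();	/* first user enables */
	feature_get();	/* later users only take a reference */
	feature_put();
	feature_put();	/* last user disables */
	return 0;
}
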
+
+#else /* !CONFIG_SCHED_CORE */
+
+static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
+static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
+
+#endif /* CONFIG_SCHED_CORE */
+
 /*
  * part of the period that we allow rt tasks to run in us.
  * default: 0.95s
@@ -184,6 +464,79 @@ int sysctl_sched_rt_runtime = 950000;
  *
  */
 
+void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
+{
+       raw_spinlock_t *lock;
+
+       /* Matches synchronize_rcu() in __sched_core_enable() */
+       preempt_disable();
+       if (sched_core_disabled()) {
+               raw_spin_lock_nested(&rq->__lock, subclass);
+               /* preempt_count *MUST* be > 1 */
+               preempt_enable_no_resched();
+               return;
+       }
+
+       for (;;) {
+               lock = __rq_lockp(rq);
+               raw_spin_lock_nested(lock, subclass);
+               if (likely(lock == __rq_lockp(rq))) {
+                       /* preempt_count *MUST* be > 1 */
+                       preempt_enable_no_resched();
+                       return;
+               }
+               raw_spin_unlock(lock);
+       }
+}
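
Editor's note: raw_spin_rq_lock_nested() has to cope with the lock covering a runqueue changing identity while core scheduling is flipped on or off: take the lock the rq currently points at, re-check that it is still the covering lock, and retry if not. A small user-space sketch of that "lock, revalidate, retry" idiom, with an atomic pointer standing in for __rq_lockp() (an illustration, not the kernel implementation):

#include <pthread.h>
#include <stdatomic.h>

struct rq_like {
	/* Which lock currently covers this object; may be switched at
	 * runtime, but only by someone who already holds it. */
	_Atomic(pthread_mutex_t *) lockp;
};

static void rq_like_lock(struct rq_like *rq)
{
	pthread_mutex_t *lock;

	for (;;) {
		lock = atomic_load(&rq->lockp);
		pthread_mutex_lock(lock);
		/* Still the lock that covers @rq? Then we own the object. */
		if (lock == atomic_load(&rq->lockp))
			return;
		/* The covering lock was switched under us: drop and retry. */
		pthread_mutex_unlock(lock);
	}
}

static void rq_like_unlock(struct rq_like *rq)
{
	/* Safe: the pointer is only switched by a holder of the lock. */
	pthread_mutex_unlock(atomic_load(&rq->lockp));
}

int main(void)
{
	pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
	struct rq_like rq = { .lockp = &m };

	rq_like_lock(&rq);
	rq_like_unlock(&rq);
	return 0;
}
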
+
+bool raw_spin_rq_trylock(struct rq *rq)
+{
+       raw_spinlock_t *lock;
+       bool ret;
+
+       /* Matches synchronize_rcu() in __sched_core_enable() */
+       preempt_disable();
+       if (sched_core_disabled()) {
+               ret = raw_spin_trylock(&rq->__lock);
+               preempt_enable();
+               return ret;
+       }
+
+       for (;;) {
+               lock = __rq_lockp(rq);
+               ret = raw_spin_trylock(lock);
+               if (!ret || (likely(lock == __rq_lockp(rq)))) {
+                       preempt_enable();
+                       return ret;
+               }
+               raw_spin_unlock(lock);
+       }
+}
+
+void raw_spin_rq_unlock(struct rq *rq)
+{
+       raw_spin_unlock(rq_lockp(rq));
+}
+
+#ifdef CONFIG_SMP
+/*
+ * double_rq_lock - safely lock two runqueues
+ */
+void double_rq_lock(struct rq *rq1, struct rq *rq2)
+{
+       lockdep_assert_irqs_disabled();
+
+       if (rq_order_less(rq2, rq1))
+               swap(rq1, rq2);
+
+       raw_spin_rq_lock(rq1);
+       if (__rq_lockp(rq1) == __rq_lockp(rq2))
+               return;
+
+       raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
+}
+#endif
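
Editor's note: double_rq_lock() avoids ABBA deadlock by always acquiring the two rq locks in a fixed order (rq_order_less()) and skips the second acquisition when both runqueues share a lock, as SMT siblings do once core scheduling is enabled. The same ordering idea in a hedged user-space form, using pointer order as the stand-in ordering:

#include <pthread.h>

/* Take two locks in a stable order so that concurrent callers passing the
 * same pair in opposite argument order cannot deadlock (the kernel uses
 * rq_order_less(); address order is the usual user-space stand-in). */
static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {			/* shared lock: take it once */
		pthread_mutex_lock(a);
		return;
	}
	if (a > b) {			/* impose one global order */
		pthread_mutex_t *tmp = a;
		a = b;
		b = tmp;
	}
	pthread_mutex_lock(a);
	pthread_mutex_lock(b);
}

int main(void)
{
	pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

	double_lock(&m1, &m2);
	pthread_mutex_unlock(&m2);
	pthread_mutex_unlock(&m1);
	return 0;
}
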
+
 /*
  * __task_rq_lock - lock the rq @p resides on.
  */
@@ -196,12 +549,12 @@ struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
 
        for (;;) {
                rq = task_rq(p);
-               raw_spin_lock(&rq->lock);
+               raw_spin_rq_lock(rq);
                if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
                        rq_pin_lock(rq, rf);
                        return rq;
                }
-               raw_spin_unlock(&rq->lock);
+               raw_spin_rq_unlock(rq);
 
                while (unlikely(task_on_rq_migrating(p)))
                        cpu_relax();
@@ -220,7 +573,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
        for (;;) {
                raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
                rq = task_rq(p);
-               raw_spin_lock(&rq->lock);
+               raw_spin_rq_lock(rq);
                /*
                 *      move_queued_task()              task_rq_lock()
                 *
@@ -242,7 +595,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
                        rq_pin_lock(rq, rf);
                        return rq;
                }
-               raw_spin_unlock(&rq->lock);
+               raw_spin_rq_unlock(rq);
                raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
 
                while (unlikely(task_on_rq_migrating(p)))
@@ -312,7 +665,7 @@ void update_rq_clock(struct rq *rq)
 {
        s64 delta;
 
-       lockdep_assert_held(&rq->lock);
+       lockdep_assert_rq_held(rq);
 
        if (rq->clock_update_flags & RQCF_ACT_SKIP)
                return;
@@ -585,7 +938,6 @@ void wake_up_q(struct wake_q_head *head)
                struct task_struct *task;
 
                task = container_of(node, struct task_struct, wake_q);
-               BUG_ON(!task);
                /* Task can safely be re-inserted now: */
                node = node->next;
                task->wake_q.next = NULL;
@@ -611,7 +963,7 @@ void resched_curr(struct rq *rq)
        struct task_struct *curr = rq->curr;
        int cpu;
 
-       lockdep_assert_held(&rq->lock);
+       lockdep_assert_rq_held(rq);
 
        if (test_tsk_need_resched(curr))
                return;
@@ -635,10 +987,10 @@ void resched_cpu(int cpu)
        struct rq *rq = cpu_rq(cpu);
        unsigned long flags;
 
-       raw_spin_lock_irqsave(&rq->lock, flags);
+       raw_spin_rq_lock_irqsave(rq, flags);
        if (cpu_online(cpu) || cpu == smp_processor_id())
                resched_curr(rq);
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
+       raw_spin_rq_unlock_irqrestore(rq, flags);
 }
 
 #ifdef CONFIG_SMP
@@ -1065,9 +1417,10 @@ static void uclamp_sync_util_min_rt_default(void)
 static inline struct uclamp_se
 uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
 {
+       /* Copy by value as we could modify it */
        struct uclamp_se uc_req = p->uclamp_req[clamp_id];
 #ifdef CONFIG_UCLAMP_TASK_GROUP
-       struct uclamp_se uc_max;
+       unsigned int tg_min, tg_max, value;
 
        /*
         * Tasks in autogroups or root task group will be
@@ -1078,9 +1431,11 @@ uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
        if (task_group(p) == &root_task_group)
                return uc_req;
 
-       uc_max = task_group(p)->uclamp[clamp_id];
-       if (uc_req.value > uc_max.value || !uc_req.user_defined)
-               return uc_max;
+       tg_min = task_group(p)->uclamp[UCLAMP_MIN].value;
+       tg_max = task_group(p)->uclamp[UCLAMP_MAX].value;
+       value = uc_req.value;
+       value = clamp(value, tg_min, tg_max);
+       uclamp_se_set(&uc_req, value, false);
 #endif
 
        return uc_req;
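
Editor's note: the rewritten uclamp_tg_restrict() no longer just caps the request at the group maximum — it clamps the task's requested value into the task group's [UCLAMP_MIN, UCLAMP_MAX] window, so a group minimum can now raise a too-low request as well. A tiny worked example of that arithmetic with illustrative values:

#include <stdio.h>

static unsigned int clamp_uint(unsigned int v, unsigned int lo, unsigned int hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
	/* Task asks for 0; group enforces a floor of 200 and a ceiling of 800. */
	printf("%u\n", clamp_uint(0, 200, 800));	/* -> 200, raised by tg_min */
	/* Task asks for 1024, same group window. */
	printf("%u\n", clamp_uint(1024, 200, 800));	/* -> 800, capped by tg_max */
	return 0;
}
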
@@ -1137,7 +1492,7 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
        struct uclamp_se *uc_se = &p->uclamp[clamp_id];
        struct uclamp_bucket *bucket;
 
-       lockdep_assert_held(&rq->lock);
+       lockdep_assert_rq_held(rq);
 
        /* Update task effective clamp */
        p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
@@ -1177,7 +1532,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
        unsigned int bkt_clamp;
        unsigned int rq_clamp;
 
-       lockdep_assert_held(&rq->lock);
+       lockdep_assert_rq_held(rq);
 
        /*
         * If sched_uclamp_used was enabled after task @p was enqueued,
@@ -1279,8 +1634,9 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
 }
 
 static inline void
-uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
+uclamp_update_active(struct task_struct *p)
 {
+       enum uclamp_id clamp_id;
        struct rq_flags rf;
        struct rq *rq;
 
@@ -1300,9 +1656,11 @@ uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
         * affecting a valid clamp bucket, the next time it's enqueued,
         * it will already see the updated clamp bucket value.
         */
-       if (p->uclamp[clamp_id].active) {
-               uclamp_rq_dec_id(rq, p, clamp_id);
-               uclamp_rq_inc_id(rq, p, clamp_id);
+       for_each_clamp_id(clamp_id) {
+               if (p->uclamp[clamp_id].active) {
+                       uclamp_rq_dec_id(rq, p, clamp_id);
+                       uclamp_rq_inc_id(rq, p, clamp_id);
+               }
        }
 
        task_rq_unlock(rq, p, &rf);
@@ -1310,20 +1668,14 @@ uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
 
 #ifdef CONFIG_UCLAMP_TASK_GROUP
 static inline void
-uclamp_update_active_tasks(struct cgroup_subsys_state *css,
-                          unsigned int clamps)
+uclamp_update_active_tasks(struct cgroup_subsys_state *css)
 {
-       enum uclamp_id clamp_id;
        struct css_task_iter it;
        struct task_struct *p;
 
        css_task_iter_start(css, 0, &it);
-       while ((p = css_task_iter_next(&it))) {
-               for_each_clamp_id(clamp_id) {
-                       if ((0x1 << clamp_id) & clamps)
-                               uclamp_update_active(p, clamp_id);
-               }
-       }
+       while ((p = css_task_iter_next(&it)))
+               uclamp_update_active(p);
        css_task_iter_end(&it);
 }
 
@@ -1590,27 +1942,38 @@ static inline void uclamp_post_fork(struct task_struct *p) { }
 static inline void init_uclamp(void) { }
 #endif /* CONFIG_UCLAMP_TASK */
 
+bool sched_task_on_rq(struct task_struct *p)
+{
+       return task_on_rq_queued(p);
+}
+
 static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 {
        if (!(flags & ENQUEUE_NOCLOCK))
                update_rq_clock(rq);
 
        if (!(flags & ENQUEUE_RESTORE)) {
-               sched_info_queued(rq, p);
+               sched_info_enqueue(rq, p);
                psi_enqueue(p, flags & ENQUEUE_WAKEUP);
        }
 
        uclamp_rq_inc(rq, p);
        p->sched_class->enqueue_task(rq, p, flags);
+
+       if (sched_core_enabled(rq))
+               sched_core_enqueue(rq, p);
 }
 
 static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 {
+       if (sched_core_enabled(rq))
+               sched_core_dequeue(rq, p);
+
        if (!(flags & DEQUEUE_NOCLOCK))
                update_rq_clock(rq);
 
        if (!(flags & DEQUEUE_SAVE)) {
-               sched_info_dequeued(rq, p);
+               sched_info_dequeue(rq, p);
                psi_dequeue(p, flags & DEQUEUE_SLEEP);
        }
 
@@ -1632,12 +1995,18 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
        dequeue_task(rq, p, flags);
 }
 
-/*
- * __normal_prio - return the priority that is based on the static prio
- */
-static inline int __normal_prio(struct task_struct *p)
+static inline int __normal_prio(int policy, int rt_prio, int nice)
 {
-       return p->static_prio;
+       int prio;
+
+       if (dl_policy(policy))
+               prio = MAX_DL_PRIO - 1;
+       else if (rt_policy(policy))
+               prio = MAX_RT_PRIO - 1 - rt_prio;
+       else
+               prio = NICE_TO_PRIO(nice);
+
+       return prio;
 }
 
 /*
@@ -1649,15 +2018,7 @@ static inline int __normal_prio(struct task_struct *p)
  */
 static inline int normal_prio(struct task_struct *p)
 {
-       int prio;
-
-       if (task_has_dl_policy(p))
-               prio = MAX_DL_PRIO-1;
-       else if (task_has_rt_policy(p))
-               prio = MAX_RT_PRIO-1 - p->rt_priority;
-       else
-               prio = __normal_prio(p);
-       return prio;
+       return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
 }
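
Editor's note: __normal_prio() now takes the policy and the user-visible parameters directly, which lets callers compute the target priority before the task is actually modified. A small stand-alone example of the mapping it produces, with the usual constant values spelled out (MAX_DL_PRIO = 0, MAX_RT_PRIO = 100, NICE_TO_PRIO(n) = 120 + n):

#include <stdio.h>

/* Same mapping as __normal_prio() above, constants written out. */
static int normal_prio_of(int is_dl, int is_rt, int rt_prio, int nice)
{
	if (is_dl)
		return 0 - 1;			/* MAX_DL_PRIO - 1  -> -1    */
	if (is_rt)
		return 100 - 1 - rt_prio;	/* MAX_RT_PRIO - 1 - rt_prio */
	return 120 + nice;			/* NICE_TO_PRIO(nice)        */
}

int main(void)
{
	printf("deadline        -> %d\n", normal_prio_of(1, 0, 0, 0));	/*  -1 */
	printf("fifo rtprio 99  -> %d\n", normal_prio_of(0, 1, 99, 0));	/*   0 */
	printf("fifo rtprio 1   -> %d\n", normal_prio_of(0, 1, 1, 0));	/*  98 */
	printf("normal nice 0   -> %d\n", normal_prio_of(0, 0, 0, 0));	/* 120 */
	printf("normal nice 19  -> %d\n", normal_prio_of(0, 0, 0, 19));	/* 139 */
	return 0;
}
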
 
 /*
@@ -1850,7 +2211,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
 static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
                                   struct task_struct *p, int new_cpu)
 {
-       lockdep_assert_held(&rq->lock);
+       lockdep_assert_rq_held(rq);
 
        deactivate_task(rq, p, DEQUEUE_NOCLOCK);
        set_task_cpu(p, new_cpu);
@@ -1916,7 +2277,6 @@ static int migration_cpu_stop(void *data)
        struct migration_arg *arg = data;
        struct set_affinity_pending *pending = arg->pending;
        struct task_struct *p = arg->task;
-       int dest_cpu = arg->dest_cpu;
        struct rq *rq = this_rq();
        bool complete = false;
        struct rq_flags rf;
@@ -1954,19 +2314,15 @@ static int migration_cpu_stop(void *data)
                if (pending) {
                        p->migration_pending = NULL;
                        complete = true;
-               }
 
-               if (dest_cpu < 0) {
                        if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask))
                                goto out;
-
-                       dest_cpu = cpumask_any_distribute(&p->cpus_mask);
                }
 
                if (task_on_rq_queued(p))
-                       rq = __migrate_task(rq, &rf, p, dest_cpu);
+                       rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
                else
-                       p->wake_cpu = dest_cpu;
+                       p->wake_cpu = arg->dest_cpu;
 
                /*
                 * XXX __migrate_task() can fail, at which point we might end
@@ -2024,7 +2380,7 @@ int push_cpu_stop(void *arg)
        struct task_struct *p = arg;
 
        raw_spin_lock_irq(&p->pi_lock);
-       raw_spin_lock(&rq->lock);
+       raw_spin_rq_lock(rq);
 
        if (task_rq(p) != rq)
                goto out_unlock;
@@ -2054,7 +2410,7 @@ int push_cpu_stop(void *arg)
 
 out_unlock:
        rq->push_busy = false;
-       raw_spin_unlock(&rq->lock);
+       raw_spin_rq_unlock(rq);
        raw_spin_unlock_irq(&p->pi_lock);
 
        put_task_struct(p);
@@ -2107,7 +2463,7 @@ __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32
                 * Because __kthread_bind() calls this on blocked tasks without
                 * holding rq->lock.
                 */
-               lockdep_assert_held(&rq->lock);
+               lockdep_assert_rq_held(rq);
                dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
        }
        if (running)
@@ -2249,7 +2605,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
                        init_completion(&my_pending.done);
                        my_pending.arg = (struct migration_arg) {
                                .task = p,
-                               .dest_cpu = -1,         /* any */
+                               .dest_cpu = dest_cpu,
                                .pending = &my_pending,
                        };
 
@@ -2257,6 +2613,15 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
                } else {
                        pending = p->migration_pending;
                        refcount_inc(&pending->refs);
+                       /*
+                        * Affinity has changed, but we've already installed a
+                        * pending. migration_cpu_stop() *must* see this, else
+                        * we risk a completion of the pending despite having a
+                        * task on a disallowed CPU.
+                        *
+                        * Serialized by p->pi_lock, so this is safe.
+                        */
+                       pending->arg.dest_cpu = dest_cpu;
                }
        }
        pending = p->migration_pending;
@@ -2277,7 +2642,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
                return -EINVAL;
        }
 
-       if (task_running(rq, p) || p->state == TASK_WAKING) {
+       if (task_running(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) {
                /*
                 * MIGRATE_ENABLE gets here because 'p == current', but for
                 * anything else we cannot do is_migration_disabled(), punt
@@ -2420,19 +2785,20 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
 #ifdef CONFIG_SCHED_DEBUG
+       unsigned int state = READ_ONCE(p->__state);
+
        /*
         * We should never call set_task_cpu() on a blocked task,
         * ttwu() will sort out the placement.
         */
-       WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
-                       !p->on_rq);
+       WARN_ON_ONCE(state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq);
 
        /*
         * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
         * because schedstat_wait_{start,end} rebase migrating task's wait_start
         * time relying on p->on_rq.
         */
-       WARN_ON_ONCE(p->state == TASK_RUNNING &&
+       WARN_ON_ONCE(state == TASK_RUNNING &&
                     p->sched_class == &fair_sched_class &&
                     (p->on_rq && !task_on_rq_migrating(p)));
 
@@ -2448,7 +2814,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
         * task_rq_lock().
         */
        WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
-                                     lockdep_is_held(&task_rq(p)->lock)));
+                                     lockdep_is_held(__rq_lockp(task_rq(p)))));
 #endif
        /*
         * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
@@ -2604,7 +2970,7 @@ out:
  * smp_call_function() if an IPI is sent by the same process we are
  * waiting to become inactive.
  */
-unsigned long wait_task_inactive(struct task_struct *p, long match_state)
+unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
 {
        int running, queued;
        struct rq_flags rf;
@@ -2632,7 +2998,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
                 * is actually now running somewhere else!
                 */
                while (task_running(rq, p)) {
-                       if (match_state && unlikely(p->state != match_state))
+                       if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
                                return 0;
                        cpu_relax();
                }
@@ -2647,7 +3013,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
                running = task_running(rq, p);
                queued = task_on_rq_queued(p);
                ncsw = 0;
-               if (!match_state || p->state == match_state)
+               if (!match_state || READ_ONCE(p->__state) == match_state)
                        ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
                task_rq_unlock(rq, p, &rf);
 
@@ -2956,7 +3322,7 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
                           struct rq_flags *rf)
 {
        check_preempt_curr(rq, p, wake_flags);
-       p->state = TASK_RUNNING;
+       WRITE_ONCE(p->__state, TASK_RUNNING);
        trace_sched_wakeup(p);
 
 #ifdef CONFIG_SMP
@@ -2979,6 +3345,9 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
                if (rq->avg_idle > max)
                        rq->avg_idle = max;
 
+               rq->wake_stamp = jiffies;
+               rq->wake_avg_idle = rq->avg_idle / 2;
+
                rq->idle_stamp = 0;
        }
 #endif
@@ -2990,7 +3359,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
 {
        int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
 
-       lockdep_assert_held(&rq->lock);
+       lockdep_assert_rq_held(rq);
 
        if (p->sched_contributes_to_load)
                rq->nr_uninterruptible--;
@@ -3345,12 +3714,12 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
                 *  - we're serialized against set_special_state() by virtue of
                 *    it disabling IRQs (this allows not taking ->pi_lock).
                 */
-               if (!(p->state & state))
+               if (!(READ_ONCE(p->__state) & state))
                        goto out;
 
                success = 1;
                trace_sched_waking(p);
-               p->state = TASK_RUNNING;
+               WRITE_ONCE(p->__state, TASK_RUNNING);
                trace_sched_wakeup(p);
                goto out;
        }
@@ -3363,7 +3732,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
         */
        raw_spin_lock_irqsave(&p->pi_lock, flags);
        smp_mb__after_spinlock();
-       if (!(p->state & state))
+       if (!(READ_ONCE(p->__state) & state))
                goto unlock;
 
        trace_sched_waking(p);
@@ -3429,7 +3798,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
         * TASK_WAKING such that we can unlock p->pi_lock before doing the
         * enqueue, such as ttwu_queue_wakelist().
         */
-       p->state = TASK_WAKING;
+       WRITE_ONCE(p->__state, TASK_WAKING);
 
        /*
         * If the owning (remote) CPU is still in the middle of schedule() with
@@ -3522,7 +3891,7 @@ bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct t
                        ret = func(p, arg);
                rq_unlock(rq, &rf);
        } else {
-               switch (p->state) {
+               switch (READ_ONCE(p->__state)) {
                case TASK_RUNNING:
                case TASK_WAKING:
                        break;
@@ -3648,7 +4017,6 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
 #ifdef CONFIG_SCHEDSTATS
 
 DEFINE_STATIC_KEY_FALSE(sched_schedstats);
-static bool __initdata __sched_schedstats = false;
 
 static void set_schedstats(bool enabled)
 {
@@ -3672,16 +4040,11 @@ static int __init setup_schedstats(char *str)
        if (!str)
                goto out;
 
-       /*
-        * This code is called before jump labels have been set up, so we can't
-        * change the static branch directly just yet.  Instead set a temporary
-        * variable so init_schedstats() can do it later.
-        */
        if (!strcmp(str, "enable")) {
-               __sched_schedstats = true;
+               set_schedstats(true);
                ret = 1;
        } else if (!strcmp(str, "disable")) {
-               __sched_schedstats = false;
+               set_schedstats(false);
                ret = 1;
        }
 out:
@@ -3692,11 +4055,6 @@ out:
 }
 __setup("schedstats=", setup_schedstats);
 
-static void __init init_schedstats(void)
-{
-       set_schedstats(__sched_schedstats);
-}
-
 #ifdef CONFIG_PROC_SYSCTL
 int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
                size_t *lenp, loff_t *ppos)
@@ -3718,8 +4076,6 @@ int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
        return err;
 }
 #endif /* CONFIG_PROC_SYSCTL */
-#else  /* !CONFIG_SCHEDSTATS */
-static inline void init_schedstats(void) {}
 #endif /* CONFIG_SCHEDSTATS */
 
 /*
@@ -3735,7 +4091,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
         * nobody will actually run it, and a signal or other external
         * event cannot wake it up and insert it on the runqueue either.
         */
-       p->state = TASK_NEW;
+       p->__state = TASK_NEW;
 
        /*
         * Make sure we do not leak PI boosting priority to the child.
@@ -3755,7 +4111,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
                } else if (PRIO_TO_NICE(p->static_prio) < 0)
                        p->static_prio = NICE_TO_PRIO(0);
 
-               p->prio = p->normal_prio = __normal_prio(p);
+               p->prio = p->normal_prio = p->static_prio;
                set_load_weight(p, false);
 
                /*
@@ -3841,7 +4197,7 @@ void wake_up_new_task(struct task_struct *p)
        struct rq *rq;
 
        raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
-       p->state = TASK_RUNNING;
+       WRITE_ONCE(p->__state, TASK_RUNNING);
 #ifdef CONFIG_SMP
        /*
         * Fork balancing, do it here and not earlier because:
@@ -4001,7 +4357,7 @@ static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
        void (*func)(struct rq *rq);
        struct callback_head *next;
 
-       lockdep_assert_held(&rq->lock);
+       lockdep_assert_rq_held(rq);
 
        while (head) {
                func = (void (*)(struct rq *))head->func;
@@ -4024,7 +4380,7 @@ static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
 {
        struct callback_head *head = rq->balance_callback;
 
-       lockdep_assert_held(&rq->lock);
+       lockdep_assert_rq_held(rq);
        if (head)
                rq->balance_callback = NULL;
 
@@ -4041,9 +4397,9 @@ static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
        unsigned long flags;
 
        if (unlikely(head)) {
-               raw_spin_lock_irqsave(&rq->lock, flags);
+               raw_spin_rq_lock_irqsave(rq, flags);
                do_balance_callbacks(rq, head);
-               raw_spin_unlock_irqrestore(&rq->lock, flags);
+               raw_spin_rq_unlock_irqrestore(rq, flags);
        }
 }
 
@@ -4074,10 +4430,10 @@ prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf
         * do an early lockdep release here:
         */
        rq_unpin_lock(rq, rf);
-       spin_release(&rq->lock.dep_map, _THIS_IP_);
+       spin_release(&__rq_lockp(rq)->dep_map, _THIS_IP_);
 #ifdef CONFIG_DEBUG_SPINLOCK
        /* this is a valid case when another task releases the spinlock */
-       rq->lock.owner = next;
+       rq_lockp(rq)->owner = next;
 #endif
 }
 
@@ -4088,9 +4444,9 @@ static inline void finish_lock_switch(struct rq *rq)
         * fix up the runqueue lock - which gets 'carried over' from
         * prev into current:
         */
-       spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
+       spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_);
        __balance_callbacks(rq);
-       raw_spin_unlock_irq(&rq->lock);
+       raw_spin_rq_unlock_irq(rq);
 }
 
 /*
@@ -4203,10 +4559,11 @@ static struct rq *finish_task_switch(struct task_struct *prev)
         * running on another CPU and we could race with its RUNNING -> DEAD
         * transition, resulting in a double drop.
         */
-       prev_state = prev->state;
+       prev_state = READ_ONCE(prev->__state);
        vtime_task_switch(prev);
        perf_event_task_sched_in(prev, current);
        finish_task(prev);
+       tick_nohz_task_switch();
        finish_lock_switch(rq);
        finish_arch_post_lock_switch();
        kcov_finish_switch(current);
@@ -4252,7 +4609,6 @@ static struct rq *finish_task_switch(struct task_struct *prev)
                put_task_struct_rcu_user(prev);
        }
 
-       tick_nohz_task_switch();
        return rq;
 }
 
@@ -4348,9 +4704,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
  * externally visible scheduler statistics: current number of runnable
  * threads, total number of context switches performed since bootup.
  */
-unsigned long nr_running(void)
+unsigned int nr_running(void)
 {
-       unsigned long i, sum = 0;
+       unsigned int i, sum = 0;
 
        for_each_online_cpu(i)
                sum += cpu_rq(i)->nr_running;
@@ -4395,7 +4751,7 @@ unsigned long long nr_context_switches(void)
  * it does become runnable.
  */
 
-unsigned long nr_iowait_cpu(int cpu)
+unsigned int nr_iowait_cpu(int cpu)
 {
        return atomic_read(&cpu_rq(cpu)->nr_iowait);
 }
@@ -4430,9 +4786,9 @@ unsigned long nr_iowait_cpu(int cpu)
  * Task CPU affinities can make all that even more 'interesting'.
  */
 
-unsigned long nr_iowait(void)
+unsigned int nr_iowait(void)
 {
-       unsigned long i, sum = 0;
+       unsigned int i, sum = 0;
 
        for_each_possible_cpu(i)
                sum += nr_iowait_cpu(i);
@@ -4897,7 +5253,7 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
 #endif
 
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
-       if (!preempt && prev->state && prev->non_block_count) {
+       if (!preempt && READ_ONCE(prev->__state) && prev->non_block_count) {
                printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
                        prev->comm, prev->pid, prev->non_block_count);
                dump_stack();
@@ -4943,7 +5299,7 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
  * Pick up the highest-prio task:
  */
 static inline struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 {
        const struct sched_class *class;
        struct task_struct *p;
@@ -4961,7 +5317,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
                if (unlikely(p == RETRY_TASK))
                        goto restart;
 
-               /* Assumes fair_sched_class->next == idle_sched_class */
+               /* Assume the next prioritized class is idle_sched_class */
                if (!p) {
                        put_prev_task(rq, prev);
                        p = pick_next_task_idle(rq);
@@ -4983,28 +5339,551 @@ restart:
        BUG();
 }
 
+#ifdef CONFIG_SCHED_CORE
+static inline bool is_task_rq_idle(struct task_struct *t)
+{
+       return (task_rq(t)->idle == t);
+}
+
+static inline bool cookie_equals(struct task_struct *a, unsigned long cookie)
+{
+       return is_task_rq_idle(a) || (a->core_cookie == cookie);
+}
+
+static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
+{
+       if (is_task_rq_idle(a) || is_task_rq_idle(b))
+               return true;
+
+       return a->core_cookie == b->core_cookie;
+}
+
+// XXX fairness/fwd progress conditions
 /*
- * __schedule() is the main scheduler function.
- *
- * The main means of driving the scheduler and thus entering this function are:
- *
- *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
- *
- *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
- *      paths. For example, see arch/x86/entry_64.S.
- *
- *      To drive preemption between tasks, the scheduler sets the flag in timer
- *      interrupt handler scheduler_tick().
- *
- *   3. Wakeups don't really cause entry into schedule(). They add a
- *      task to the run-queue and that's it.
- *
- *      Now, if the new task added to the run-queue preempts the current
- *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
- *      called on the nearest possible occasion:
- *
- *       - If the kernel is preemptible (CONFIG_PREEMPTION=y):
- *
+ * Returns
+ * - NULL if there is no runnable task for this class.
+ * - the highest priority task for this runqueue if it matches
+ *   rq->core->core_cookie or its priority is greater than max.
+ * - Else returns idle_task.
+ */
+static struct task_struct *
+pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max, bool in_fi)
+{
+       struct task_struct *class_pick, *cookie_pick;
+       unsigned long cookie = rq->core->core_cookie;
+
+       class_pick = class->pick_task(rq);
+       if (!class_pick)
+               return NULL;
+
+       if (!cookie) {
+               /*
+                * If class_pick is tagged, return it only if it has
+                * higher priority than max.
+                */
+               if (max && class_pick->core_cookie &&
+                   prio_less(class_pick, max, in_fi))
+                       return idle_sched_class.pick_task(rq);
+
+               return class_pick;
+       }
+
+       /*
+        * If class_pick is idle or matches cookie, return early.
+        */
+       if (cookie_equals(class_pick, cookie))
+               return class_pick;
+
+       cookie_pick = sched_core_find(rq, cookie);
+
+       /*
+        * If class_pick > max && class_pick > cookie_pick, it is the highest priority
+        * task on the core (so far) and it must be selected, otherwise we must
+        * go with the cookie pick in order to satisfy the constraint.
+        */
+       if (prio_less(cookie_pick, class_pick, in_fi) &&
+           (!max || prio_less(max, class_pick, in_fi)))
+               return class_pick;
+
+       return cookie_pick;
+}
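
Editor's note: pick_task() balances "run the best task of this class" against "only run something compatible with the core-wide cookie". Below is a deliberately simplified, stand-alone model of that decision using integer priorities where higher is better; it only mirrors the shape of the logic above and is not the kernel code:

#include <stddef.h>

struct toy_task {
	unsigned long cookie;
	int prio;		/* higher is better in this toy model */
};

/*
 * Toy restatement of the pick_task() decision:
 *   @class_pick  - best task of the current class on this sibling
 *   @cookie_pick - best task on this sibling matching @cookie
 *   @max         - highest priority task selected core-wide so far, or NULL
 * Returns NULL where the kernel would force the sibling idle.
 */
static const struct toy_task *
toy_pick(const struct toy_task *class_pick, const struct toy_task *cookie_pick,
	 const struct toy_task *max, unsigned long cookie)
{
	if (!cookie) {
		/* Untagged core: a tagged pick must beat @max to be kept. */
		if (max && class_pick->cookie && class_pick->prio < max->prio)
			return NULL;
		return class_pick;
	}

	if (class_pick->cookie == cookie)
		return class_pick;

	/* A non-matching pick is only kept if it beats both candidates. */
	if (class_pick->prio > cookie_pick->prio &&
	    (!max || class_pick->prio > max->prio))
		return class_pick;

	return cookie_pick;
}
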
+
+extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi);
+
+static struct task_struct *
+pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+       struct task_struct *next, *max = NULL;
+       const struct sched_class *class;
+       const struct cpumask *smt_mask;
+       bool fi_before = false;
+       int i, j, cpu, occ = 0;
+       bool need_sync;
+
+       if (!sched_core_enabled(rq))
+               return __pick_next_task(rq, prev, rf);
+
+       cpu = cpu_of(rq);
+
+       /* Stopper task is switching into idle, no need for core-wide selection. */
+       if (cpu_is_offline(cpu)) {
+               /*
+                * Reset core_pick so that we don't enter the fastpath when
+                * coming online. core_pick would already be migrated to
+                * another cpu during offline.
+                */
+               rq->core_pick = NULL;
+               return __pick_next_task(rq, prev, rf);
+       }
+
+       /*
+        * If there were no {en,de}queues since we picked (IOW, the task
+        * pointers are all still valid), and we haven't scheduled the last
+        * pick yet, do so now.
+        *
+        * rq->core_pick can be NULL if no selection was made for a CPU because
+        * it was either offline or went offline during a sibling's core-wide
+        * selection. In this case, do a core-wide selection.
+        */
+       if (rq->core->core_pick_seq == rq->core->core_task_seq &&
+           rq->core->core_pick_seq != rq->core_sched_seq &&
+           rq->core_pick) {
+               WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq);
+
+               next = rq->core_pick;
+               if (next != prev) {
+                       put_prev_task(rq, prev);
+                       set_next_task(rq, next);
+               }
+
+               rq->core_pick = NULL;
+               return next;
+       }
+
+       put_prev_task_balance(rq, prev, rf);
+
+       smt_mask = cpu_smt_mask(cpu);
+       need_sync = !!rq->core->core_cookie;
+
+       /* reset state */
+       rq->core->core_cookie = 0UL;
+       if (rq->core->core_forceidle) {
+               need_sync = true;
+               fi_before = true;
+               rq->core->core_forceidle = false;
+       }
+
+       /*
+        * core->core_task_seq, core->core_pick_seq, rq->core_sched_seq
+        *
+        * @task_seq guards the task state ({en,de}queues)
+        * @pick_seq is the @task_seq we did a selection on
+        * @sched_seq is the @pick_seq we scheduled
+        *
+        * However, preemptions can cause multiple picks on the same task set.
+        * 'Fix' this by also increasing @task_seq for every pick.
+        */
+       rq->core->core_task_seq++;
+
+       /*
+        * Optimize for the common case where this CPU has no cookies
+        * and there are no cookied tasks running on siblings.
+        */
+       if (!need_sync) {
+               for_each_class(class) {
+                       next = class->pick_task(rq);
+                       if (next)
+                               break;
+               }
+
+               if (!next->core_cookie) {
+                       rq->core_pick = NULL;
+                       /*
+                        * For robustness, update the min_vruntime_fi for
+                        * unconstrained picks as well.
+                        */
+                       WARN_ON_ONCE(fi_before);
+                       task_vruntime_update(rq, next, false);
+                       goto done;
+               }
+       }
+
+       for_each_cpu(i, smt_mask) {
+               struct rq *rq_i = cpu_rq(i);
+
+               rq_i->core_pick = NULL;
+
+               if (i != cpu)
+                       update_rq_clock(rq_i);
+       }
+
+       /*
+        * Try and select tasks for each sibling in descending sched_class
+        * order.
+        */
+       for_each_class(class) {
+again:
+               for_each_cpu_wrap(i, smt_mask, cpu) {
+                       struct rq *rq_i = cpu_rq(i);
+                       struct task_struct *p;
+
+                       if (rq_i->core_pick)
+                               continue;
+
+                       /*
+                        * If this sibling doesn't yet have a suitable task to
+                        * run, ask for the most eligible task, given the
+                        * highest priority task already selected for this
+                        * core.
+                        */
+                       p = pick_task(rq_i, class, max, fi_before);
+                       if (!p)
+                               continue;
+
+                       if (!is_task_rq_idle(p))
+                               occ++;
+
+                       rq_i->core_pick = p;
+                       if (rq_i->idle == p && rq_i->nr_running) {
+                               rq->core->core_forceidle = true;
+                               if (!fi_before)
+                                       rq->core->core_forceidle_seq++;
+                       }
+
+                       /*
+                        * If this new candidate is of higher priority than the
+                        * previous, and they're incompatible, we need to wipe
+                        * the slate and start over. pick_task makes sure that
+                        * p's priority is more than max if it doesn't match
+                        * max's cookie.
+                        *
+                        * NOTE: this is a linear max-filter and is thus bounded
+                        * in execution time.
+                        */
+                       if (!max || !cookie_match(max, p)) {
+                               struct task_struct *old_max = max;
+
+                               rq->core->core_cookie = p->core_cookie;
+                               max = p;
+
+                               if (old_max) {
+                                       rq->core->core_forceidle = false;
+                                       for_each_cpu(j, smt_mask) {
+                                               if (j == i)
+                                                       continue;
+
+                                               cpu_rq(j)->core_pick = NULL;
+                                       }
+                                       occ = 1;
+                                       goto again;
+                               }
+                       }
+               }
+       }
+
+       rq->core->core_pick_seq = rq->core->core_task_seq;
+       next = rq->core_pick;
+       rq->core_sched_seq = rq->core->core_pick_seq;
+
+       /* Something should have been selected for current CPU */
+       WARN_ON_ONCE(!next);
+
+       /*
+        * Reschedule siblings
+        *
+        * NOTE: L1TF -- at this point we're no longer running the old task and
+        * sending an IPI (below) ensures the sibling will no longer be running
+        * their task. This ensures there is no inter-sibling overlap between
+        * non-matching user state.
+        */
+       for_each_cpu(i, smt_mask) {
+               struct rq *rq_i = cpu_rq(i);
+
+               /*
+                * An online sibling might have gone offline before a task
+                * could be picked for it, or it might be offline but later
+                * happen to come online, but it's too late and nothing was
+                * picked for it.  That's Ok - it will pick tasks for itself,
+                * so ignore it.
+                */
+               if (!rq_i->core_pick)
+                       continue;
+
+               /*
+                * Update for new !FI->FI transitions, or if continuing to be in !FI:
+                * fi_before     fi      update?
+                *  0            0       1
+                *  0            1       1
+                *  1            0       1
+                *  1            1       0
+                */
+               if (!(fi_before && rq->core->core_forceidle))
+                       task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle);
+
+               rq_i->core_pick->core_occupation = occ;
+
+               if (i == cpu) {
+                       rq_i->core_pick = NULL;
+                       continue;
+               }
+
+               /* Did we break L1TF mitigation requirements? */
+               WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick));
+
+               if (rq_i->curr == rq_i->core_pick) {
+                       rq_i->core_pick = NULL;
+                       continue;
+               }
+
+               resched_curr(rq_i);
+       }
+
+done:
+       set_next_task(rq, next);
+       return next;
+}
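
Editor's note: the pick above is guarded by three counters — core_task_seq advances on every enqueue/dequeue (and on every pick), core_pick_seq records the task generation the last core-wide selection was made for, and core_sched_seq records the selection this sibling has already consumed. A cached pick can be reused only while those line up, as in this minimal sketch (names are hypothetical; only the counter relationship matters):

#include <stdbool.h>

struct core_state {
	unsigned long task_seq;		/* bumped whenever the task set changes */
	unsigned long pick_seq;		/* task_seq the selection was made for  */
};

struct sibling_state {
	unsigned long sched_seq;	/* pick_seq this sibling last consumed  */
	void *pick;			/* cached selection, if any             */
};

/* The cached pick may be used iff the task set has not changed since the
 * selection was made and this sibling has not consumed that selection yet. */
static bool can_use_cached_pick(const struct core_state *core,
				const struct sibling_state *sib)
{
	return core->pick_seq == core->task_seq &&
	       core->pick_seq != sib->sched_seq &&
	       sib->pick;
}
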
+
+static bool try_steal_cookie(int this, int that)
+{
+       struct rq *dst = cpu_rq(this), *src = cpu_rq(that);
+       struct task_struct *p;
+       unsigned long cookie;
+       bool success = false;
+
+       local_irq_disable();
+       double_rq_lock(dst, src);
+
+       cookie = dst->core->core_cookie;
+       if (!cookie)
+               goto unlock;
+
+       if (dst->curr != dst->idle)
+               goto unlock;
+
+       p = sched_core_find(src, cookie);
+       if (p == src->idle)
+               goto unlock;
+
+       do {
+               if (p == src->core_pick || p == src->curr)
+                       goto next;
+
+               if (!cpumask_test_cpu(this, &p->cpus_mask))
+                       goto next;
+
+               if (p->core_occupation > dst->idle->core_occupation)
+                       goto next;
+
+               p->on_rq = TASK_ON_RQ_MIGRATING;
+               deactivate_task(src, p, 0);
+               set_task_cpu(p, this);
+               activate_task(dst, p, 0);
+               p->on_rq = TASK_ON_RQ_QUEUED;
+
+               resched_curr(dst);
+
+               success = true;
+               break;
+
+next:
+               p = sched_core_next(p, cookie);
+       } while (p);
+
+unlock:
+       double_rq_unlock(dst, src);
+       local_irq_enable();
+
+       return success;
+}
+
+static bool steal_cookie_task(int cpu, struct sched_domain *sd)
+{
+       int i;
+
+       for_each_cpu_wrap(i, sched_domain_span(sd), cpu) {
+               if (i == cpu)
+                       continue;
+
+               if (need_resched())
+                       break;
+
+               if (try_steal_cookie(cpu, i))
+                       return true;
+       }
+
+       return false;
+}
+
+static void sched_core_balance(struct rq *rq)
+{
+       struct sched_domain *sd;
+       int cpu = cpu_of(rq);
+
+       preempt_disable();
+       rcu_read_lock();
+       raw_spin_rq_unlock_irq(rq);
+       for_each_domain(cpu, sd) {
+               if (need_resched())
+                       break;
+
+               if (steal_cookie_task(cpu, sd))
+                       break;
+       }
+       raw_spin_rq_lock_irq(rq);
+       rcu_read_unlock();
+       preempt_enable();
+}
+
+static DEFINE_PER_CPU(struct callback_head, core_balance_head);
+
+void queue_core_balance(struct rq *rq)
+{
+       if (!sched_core_enabled(rq))
+               return;
+
+       if (!rq->core->core_cookie)
+               return;
+
+       if (!rq->nr_running) /* not forced idle */
+               return;
+
+       queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance);
+}
+
+static void sched_core_cpu_starting(unsigned int cpu)
+{
+       const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+       struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
+       unsigned long flags;
+       int t;
+
+       sched_core_lock(cpu, &flags);
+
+       WARN_ON_ONCE(rq->core != rq);
+
+       /* if we're the first, we'll be our own leader */
+       if (cpumask_weight(smt_mask) == 1)
+               goto unlock;
+
+       /* find the leader */
+       for_each_cpu(t, smt_mask) {
+               if (t == cpu)
+                       continue;
+               rq = cpu_rq(t);
+               if (rq->core == rq) {
+                       core_rq = rq;
+                       break;
+               }
+       }
+
+       if (WARN_ON_ONCE(!core_rq)) /* whoopsie */
+               goto unlock;
+
+       /* install and validate core_rq */
+       for_each_cpu(t, smt_mask) {
+               rq = cpu_rq(t);
+
+               if (t == cpu)
+                       rq->core = core_rq;
+
+               WARN_ON_ONCE(rq->core != core_rq);
+       }
+
+unlock:
+       sched_core_unlock(cpu, &flags);
+}
+
+static void sched_core_cpu_deactivate(unsigned int cpu)
+{
+       const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+       struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
+       unsigned long flags;
+       int t;
+
+       sched_core_lock(cpu, &flags);
+
+       /* if we're the last man standing, nothing to do */
+       if (cpumask_weight(smt_mask) == 1) {
+               WARN_ON_ONCE(rq->core != rq);
+               goto unlock;
+       }
+
+       /* if we're not the leader, nothing to do */
+       if (rq->core != rq)
+               goto unlock;
+
+       /* find a new leader */
+       for_each_cpu(t, smt_mask) {
+               if (t == cpu)
+                       continue;
+               core_rq = cpu_rq(t);
+               break;
+       }
+
+       if (WARN_ON_ONCE(!core_rq)) /* impossible */
+               goto unlock;
+
+       /* copy the shared state to the new leader */
+       core_rq->core_task_seq      = rq->core_task_seq;
+       core_rq->core_pick_seq      = rq->core_pick_seq;
+       core_rq->core_cookie        = rq->core_cookie;
+       core_rq->core_forceidle     = rq->core_forceidle;
+       core_rq->core_forceidle_seq = rq->core_forceidle_seq;
+
+       /* install new leader */
+       for_each_cpu(t, smt_mask) {
+               rq = cpu_rq(t);
+               rq->core = core_rq;
+       }
+
+unlock:
+       sched_core_unlock(cpu, &flags);
+}
+
+static inline void sched_core_cpu_dying(unsigned int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+
+       if (rq->core != rq)
+               rq->core = rq;
+}
+
+#else /* !CONFIG_SCHED_CORE */
+
+static inline void sched_core_cpu_starting(unsigned int cpu) {}
+static inline void sched_core_cpu_deactivate(unsigned int cpu) {}
+static inline void sched_core_cpu_dying(unsigned int cpu) {}
+
+static struct task_struct *
+pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+       return __pick_next_task(rq, prev, rf);
+}
+
+#endif /* CONFIG_SCHED_CORE */
+
+/*
+ * __schedule() is the main scheduler function.
+ *
+ * The main means of driving the scheduler and thus entering this function are:
+ *
+ *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
+ *
+ *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
+ *      paths. For example, see arch/x86/entry_64.S.
+ *
+ *      To drive preemption between tasks, the scheduler sets the flag in timer
+ *      interrupt handler scheduler_tick().
+ *
+ *   3. Wakeups don't really cause entry into schedule(). They add a
+ *      task to the run-queue and that's it.
+ *
+ *      Now, if the new task added to the run-queue preempts the current
+ *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
+ *      called on the nearest possible occasion:
+ *
+ *       - If the kernel is preemptible (CONFIG_PREEMPTION=y):
+ *
 *         - in syscall or exception context, at the next outermost
  *           preempt_enable(). (this might be as soon as the wake_up()'s
  *           spin_unlock()!)
@@ -5074,10 +5953,10 @@ static void __sched notrace __schedule(bool preempt)
         *  - we form a control dependency vs deactivate_task() below.
         *  - ptrace_{,un}freeze_traced() can change ->state underneath us.
         */
-       prev_state = prev->state;
+       prev_state = READ_ONCE(prev->__state);
        if (!preempt && prev_state) {
                if (signal_pending_state(prev_state, prev)) {
-                       prev->state = TASK_RUNNING;
+                       WRITE_ONCE(prev->__state, TASK_RUNNING);
                } else {
                        prev->sched_contributes_to_load =
                                (prev_state & TASK_UNINTERRUPTIBLE) &&
@@ -5150,7 +6029,7 @@ static void __sched notrace __schedule(bool preempt)
 
                rq_unpin_lock(rq, &rf);
                __balance_callbacks(rq);
-               raw_spin_unlock_irq(&rq->lock);
+               raw_spin_rq_unlock_irq(rq);
        }
 }
 
@@ -5174,7 +6053,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
 {
        unsigned int task_flags;
 
-       if (!tsk->state)
+       if (task_is_running(tsk))
                return;
 
        task_flags = tsk->flags;
@@ -5249,7 +6128,7 @@ void __sched schedule_idle(void)
         * current task can be in any other state. Note, idle is always in the
         * TASK_RUNNING state.
         */
-       WARN_ON_ONCE(current->state);
+       WARN_ON_ONCE(current->__state);
        do {
                __schedule(false);
        } while (need_resched());
@@ -5548,6 +6427,18 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag
 }
 EXPORT_SYMBOL(default_wake_function);
 
+static void __setscheduler_prio(struct task_struct *p, int prio)
+{
+       if (dl_prio(prio))
+               p->sched_class = &dl_sched_class;
+       else if (rt_prio(prio))
+               p->sched_class = &rt_sched_class;
+       else
+               p->sched_class = &fair_sched_class;
+
+       p->prio = prio;
+}
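
Editor's note: __setscheduler_prio() centralizes the priority-to-class mapping used by both rt_mutex_setprio() and __sched_setscheduler(): on the kernel-internal scale, negative priorities mean deadline, 0..99 realtime, and 100..139 fair. A tiny illustration of those thresholds, with the usual constant values assumed (MAX_DL_PRIO = 0, MAX_RT_PRIO = 100):

#include <stdio.h>

/* dl_prio(p): p < MAX_DL_PRIO (0); rt_prio(p): p < MAX_RT_PRIO (100). */
static const char *class_of(int prio)
{
	if (prio < 0)
		return "dl_sched_class";
	if (prio < 100)
		return "rt_sched_class";
	return "fair_sched_class";
}

int main(void)
{
	printf("prio -1  -> %s\n", class_of(-1));	/* deadline    */
	printf("prio 0   -> %s\n", class_of(0));	/* highest RT  */
	printf("prio 120 -> %s\n", class_of(120));	/* nice 0, CFS */
	return 0;
}
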
+
 #ifdef CONFIG_RT_MUTEXES
 
 static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
@@ -5663,22 +6554,19 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
                } else {
                        p->dl.pi_se = &p->dl;
                }
-               p->sched_class = &dl_sched_class;
        } else if (rt_prio(prio)) {
                if (dl_prio(oldprio))
                        p->dl.pi_se = &p->dl;
                if (oldprio < prio)
                        queue_flag |= ENQUEUE_HEAD;
-               p->sched_class = &rt_sched_class;
        } else {
                if (dl_prio(oldprio))
                        p->dl.pi_se = &p->dl;
                if (rt_prio(oldprio))
                        p->rt.timeout = 0;
-               p->sched_class = &fair_sched_class;
        }
 
-       p->prio = prio;
+       __setscheduler_prio(p, prio);
 
        if (queued)
                enqueue_task(rq, p, queue_flag);
@@ -5692,7 +6580,7 @@ out_unlock:
 
        rq_unpin_lock(rq, &rf);
        __balance_callbacks(rq);
-       raw_spin_unlock(&rq->lock);
+       raw_spin_rq_unlock(rq);
 
        preempt_enable();
 }
@@ -6031,35 +6919,6 @@ static void __setscheduler_params(struct task_struct *p,
        set_load_weight(p, true);
 }
 
-/* Actually do priority change: must hold pi & rq lock. */
-static void __setscheduler(struct rq *rq, struct task_struct *p,
-                          const struct sched_attr *attr, bool keep_boost)
-{
-       /*
-        * If params can't change scheduling class changes aren't allowed
-        * either.
-        */
-       if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
-               return;
-
-       __setscheduler_params(p, attr);
-
-       /*
-        * Keep a potential priority boosting if called from
-        * sched_setscheduler().
-        */
-       p->prio = normal_prio(p);
-       if (keep_boost)
-               p->prio = rt_effective_prio(p, p->prio);
-
-       if (dl_prio(p->prio))
-               p->sched_class = &dl_sched_class;
-       else if (rt_prio(p->prio))
-               p->sched_class = &rt_sched_class;
-       else
-               p->sched_class = &fair_sched_class;
-}
-
 /*
  * Check the target process has a UID that matches the current process's:
  */
@@ -6080,10 +6939,8 @@ static int __sched_setscheduler(struct task_struct *p,
                                const struct sched_attr *attr,
                                bool user, bool pi)
 {
-       int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
-                     MAX_RT_PRIO - 1 - attr->sched_priority;
-       int retval, oldprio, oldpolicy = -1, queued, running;
-       int new_effective_prio, policy = attr->sched_policy;
+       int oldpolicy = -1, policy = attr->sched_policy;
+       int retval, oldprio, newprio, queued, running;
        const struct sched_class *prev_class;
        struct callback_head *head;
        struct rq_flags rf;
@@ -6281,6 +7138,7 @@ change:
        p->sched_reset_on_fork = reset_on_fork;
        oldprio = p->prio;
 
+       newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
        if (pi) {
                /*
                 * Take priority boosted tasks into account. If the new
@@ -6289,8 +7147,8 @@ change:
         * the runqueue. This will be done when the task deboosts
         * itself.
                 */
-               new_effective_prio = rt_effective_prio(p, newprio);
-               if (new_effective_prio == oldprio)
+               newprio = rt_effective_prio(p, newprio);
+               if (newprio == oldprio)
                        queue_flags &= ~DEQUEUE_MOVE;
        }
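	/*
	 * Editorial worked example (not part of this patch), assuming
	 * __normal_prio() preserves the computation removed earlier in this
	 * patch: SCHED_FIFO with sched_priority 50 gives newprio =
	 * MAX_RT_PRIO - 1 - 50 = 49, SCHED_DEADLINE gives newprio =
	 * MAX_DL_PRIO - 1 = -1. rt_effective_prio() may then return a boosted
	 * (numerically lower) value if @p currently inherits priority from an
	 * rt_mutex waiter; only when the result equals oldprio can the
	 * DEQUEUE_MOVE step be skipped.
	 */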
 
@@ -6303,7 +7161,10 @@ change:
 
        prev_class = p->sched_class;
 
-       __setscheduler(rq, p, attr, pi);
+       if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
+               __setscheduler_params(p, attr);
+               __setscheduler_prio(p, newprio);
+       }
        __setscheduler_uclamp(p, attr);
 
        if (queued) {
@@ -6389,6 +7250,7 @@ int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
 {
        return __sched_setscheduler(p, attr, false, true);
 }
+EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
 
 /**
  * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
@@ -7148,7 +8010,7 @@ again:
        if (curr->sched_class != p->sched_class)
                goto out_unlock;
 
-       if (task_running(p_rq, p) || p->state)
+       if (task_running(p_rq, p) || !task_is_running(p))
                goto out_unlock;
 
        yielded = curr->sched_class->yield_to_task(rq, p);
@@ -7351,7 +8213,7 @@ void sched_show_task(struct task_struct *p)
 
        pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
 
-       if (p->state == TASK_RUNNING)
+       if (task_is_running(p))
                pr_cont("  running task    ");
 #ifdef CONFIG_DEBUG_STACK_USAGE
        free = stack_not_used(p);
@@ -7375,26 +8237,28 @@ EXPORT_SYMBOL_GPL(sched_show_task);
 static inline bool
 state_filter_match(unsigned long state_filter, struct task_struct *p)
 {
+       unsigned int state = READ_ONCE(p->__state);
+
        /* no filter, everything matches */
        if (!state_filter)
                return true;
 
        /* filter, but doesn't match */
-       if (!(p->state & state_filter))
+       if (!(state & state_filter))
                return false;
 
        /*
         * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
         * TASK_KILLABLE).
         */
-       if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE)
+       if (state_filter == TASK_UNINTERRUPTIBLE && state == TASK_IDLE)
                return false;
 
        return true;
 }
 
 
-void show_state_filter(unsigned long state_filter)
+void show_state_filter(unsigned int state_filter)
 {
        struct task_struct *g, *p;
 
@@ -7433,19 +8297,32 @@ void show_state_filter(unsigned long state_filter)
  * NOTE: this function does not set the idle thread's NEED_RESCHED
  * flag, to make booting more robust.
  */
-void init_idle(struct task_struct *idle, int cpu)
+void __init init_idle(struct task_struct *idle, int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
        unsigned long flags;
 
        __sched_fork(0, idle);
 
+       /*
+        * The idle task doesn't need the kthread struct to function, but it
+        * is dressed up as a per-CPU kthread and thus needs to play the part
+        * if we want to avoid special-casing it in code that deals with per-CPU
+        * kthreads.
+        */
+       set_kthread_struct(idle);
+
        raw_spin_lock_irqsave(&idle->pi_lock, flags);
-       raw_spin_lock(&rq->lock);
+       raw_spin_rq_lock(rq);
 
-       idle->state = TASK_RUNNING;
+       idle->__state = TASK_RUNNING;
        idle->se.exec_start = sched_clock();
-       idle->flags |= PF_IDLE;
+       /*
+        * PF_KTHREAD should already be set at this point; regardless, make it
+        * look like a proper per-CPU kthread.
+        */
+       idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY;
+       kthread_set_per_cpu(idle, cpu);
 
        scs_task_reset(idle);
        kasan_unpoison_task_stack(idle);
@@ -7479,7 +8356,7 @@ void init_idle(struct task_struct *idle, int cpu)
 #ifdef CONFIG_SMP
        idle->on_cpu = 1;
 #endif
-       raw_spin_unlock(&rq->lock);
+       raw_spin_rq_unlock(rq);
        raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
 
        /* Set the preempt count _outside_ the spinlocks! */
@@ -7645,8 +8522,7 @@ static void balance_push(struct rq *rq)
 {
        struct task_struct *push_task = rq->curr;
 
-       lockdep_assert_held(&rq->lock);
-       SCHED_WARN_ON(rq->cpu != smp_processor_id());
+       lockdep_assert_rq_held(rq);
 
        /*
         * Ensure the thing is persistent until balance_push_set(.on = false);
@@ -7654,20 +8530,17 @@ static void balance_push(struct rq *rq)
        rq->balance_callback = &balance_push_callback;
 
        /*
-        * Only active while going offline.
+        * Only active while going offline and when invoked on the outgoing
+        * CPU.
         */
-       if (!cpu_dying(rq->cpu))
+       if (!cpu_dying(rq->cpu) || rq != this_rq())
                return;
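	/*
	 * Editorial note (not part of this patch): the rq != this_rq() test
	 * above is the fix this commit is named for. rt_mutex_setprio() and
	 * __sched_setscheduler() run the balance callbacks of the task's rq,
	 * which may belong to a remote CPU that is going down, while
	 * balance_push() only makes sense on the outgoing CPU itself, since it
	 * hands rq->curr to the *local* per-CPU push_work below. A remote
	 * invocation therefore returns after re-arming the callback.
	 */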
 
        /*
         * Both the cpu-hotplug and stop task are in this case and are
         * required to complete the hotplug process.
-        *
-        * XXX: the idle task does not match kthread_is_per_cpu() due to
-        * histerical raisins.
         */
-       if (rq->idle == push_task ||
-           kthread_is_per_cpu(push_task) ||
+       if (kthread_is_per_cpu(push_task) ||
            is_migration_disabled(push_task)) {
 
                /*
@@ -7683,9 +8556,9 @@ static void balance_push(struct rq *rq)
                 */
                if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
                    rcuwait_active(&rq->hotplug_wait)) {
-                       raw_spin_unlock(&rq->lock);
+                       raw_spin_rq_unlock(rq);
                        rcuwait_wake_up(&rq->hotplug_wait);
-                       raw_spin_lock(&rq->lock);
+                       raw_spin_rq_lock(rq);
                }
                return;
        }
@@ -7695,7 +8568,7 @@ static void balance_push(struct rq *rq)
         * Temporarily drop rq->lock such that we can wake-up the stop task.
         * Both preemption and IRQs are still disabled.
         */
-       raw_spin_unlock(&rq->lock);
+       raw_spin_rq_unlock(rq);
        stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
                            this_cpu_ptr(&push_work));
        /*
@@ -7703,7 +8576,7 @@ static void balance_push(struct rq *rq)
         * schedule(). The next pick is obviously going to be the stop task,
         * which is a per-CPU kthread (kthread_is_per_cpu()) and will push
         * this task away.
         */
-       raw_spin_lock(&rq->lock);
+       raw_spin_rq_lock(rq);
 }
 
 static void balance_push_set(int cpu, bool on)
@@ -7922,6 +8795,8 @@ int sched_cpu_deactivate(unsigned int cpu)
         */
        if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
                static_branch_dec_cpuslocked(&sched_smt_present);
+
+       sched_core_cpu_deactivate(cpu);
 #endif
 
        if (!sched_smp_initialized)
@@ -7947,6 +8822,7 @@ static void sched_rq_cpu_starting(unsigned int cpu)
 
 int sched_cpu_starting(unsigned int cpu)
 {
+       sched_core_cpu_starting(cpu);
        sched_rq_cpu_starting(cpu);
        sched_tick_start(cpu);
        return 0;
@@ -7993,7 +8869,7 @@ static void dump_rq_tasks(struct rq *rq, const char *loglvl)
        struct task_struct *g, *p;
        int cpu = cpu_of(rq);
 
-       lockdep_assert_held(&rq->lock);
+       lockdep_assert_rq_held(rq);
 
        printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running);
        for_each_process_thread(g, p) {
@@ -8025,6 +8901,7 @@ int sched_cpu_dying(unsigned int cpu)
        calc_load_migrate(rq);
        update_max_interval();
        hrtick_clear(rq);
+       sched_core_cpu_dying(cpu);
        return 0;
 }
 #endif
@@ -8045,6 +8922,7 @@ void __init sched_init_smp(void)
        /* Move init over to a non-isolated CPU */
        if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
                BUG();
+       current->flags &= ~PF_NO_SETAFFINITY;
        sched_init_granularity();
 
        init_sched_rt_class();
@@ -8166,7 +9044,7 @@ void __init sched_init(void)
                struct rq *rq;
 
                rq = cpu_rq(i);
-               raw_spin_lock_init(&rq->lock);
+               raw_spin_lock_init(&rq->__lock);
                rq->nr_running = 0;
                rq->calc_load_active = 0;
                rq->calc_load_update = jiffies + LOAD_FREQ;
@@ -8214,6 +9092,8 @@ void __init sched_init(void)
                rq->online = 0;
                rq->idle_stamp = 0;
                rq->avg_idle = 2*sysctl_sched_migration_cost;
+               rq->wake_stamp = jiffies;
+               rq->wake_avg_idle = rq->avg_idle;
                rq->max_idle_balance_cost = sysctl_sched_migration_cost;
 
                INIT_LIST_HEAD(&rq->cfs_tasks);
@@ -8231,6 +9111,16 @@ void __init sched_init(void)
 #endif /* CONFIG_SMP */
                hrtick_rq_init(rq);
                atomic_set(&rq->nr_iowait, 0);
+
+#ifdef CONFIG_SCHED_CORE
+               rq->core = rq;
+               rq->core_pick = NULL;
+               rq->core_enabled = 0;
+               rq->core_tree = RB_ROOT;
+               rq->core_forceidle = false;
+
+               rq->core_cookie = 0UL;
+#endif
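		/*
		 * Editorial note (not part of this patch): each rq starts out
		 * as its own core leader (rq->core == rq) with core scheduling
		 * disabled; sched_core_cpu_starting(), hooked into
		 * sched_cpu_starting() above, is presumably what later lets
		 * SMT siblings share a single core rq once they come online.
		 */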
        }
 
        set_load_weight(&init_task, false);
@@ -8257,8 +9147,6 @@ void __init sched_init(void)
 #endif
        init_sched_fair_class();
 
-       init_schedstats();
-
        psi_init();
 
        init_uclamp();
@@ -8276,15 +9164,15 @@ static inline int preempt_count_equals(int preempt_offset)
 
 void __might_sleep(const char *file, int line, int preempt_offset)
 {
+       unsigned int state = get_current_state();
        /*
         * Blocking primitives will set (and therefore destroy) current->state,
         * since we will exit with TASK_RUNNING make sure we enter with it,
         * otherwise we will destroy state.
         */
-       WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
+       WARN_ONCE(state != TASK_RUNNING && current->task_state_change,
                        "do not call blocking ops when !TASK_RUNNING; "
-                       "state=%lx set at [<%p>] %pS\n",
-                       current->state,
+                       "state=%x set at [<%p>] %pS\n", state,
                        (void *)current->task_state_change,
                        (void *)current->task_state_change);
 
@@ -8680,7 +9568,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
 
 #ifdef CONFIG_UCLAMP_TASK_GROUP
        /* Propagate the effective uclamp value for the new group */
+       mutex_lock(&uclamp_mutex);
+       rcu_read_lock();
        cpu_util_update_eff(css);
+       rcu_read_unlock();
+       mutex_unlock(&uclamp_mutex);
 #endif
 
        return 0;
@@ -8741,7 +9633,7 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
                 * has happened. This would lead to problems with PELT, due to
                 * move wanting to detach+attach while we're not attached yet.
                 */
-               if (task->state == TASK_NEW)
+               if (READ_ONCE(task->__state) == TASK_NEW)
                        ret = -EINVAL;
                raw_spin_unlock_irq(&task->pi_lock);
 
@@ -8770,6 +9662,9 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css)
        enum uclamp_id clamp_id;
        unsigned int clamps;
 
+       lockdep_assert_held(&uclamp_mutex);
+       SCHED_WARN_ON(!rcu_read_lock_held());
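	/*
	 * Editorial note (not part of this patch): these two assertions
	 * document the locking added around the css_online caller earlier in
	 * this patch. css_for_each_descendant_pre() below walks the cgroup
	 * tree under RCU, and uclamp_mutex serializes concurrent updates of
	 * the effective clamp values.
	 */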
+
        css_for_each_descendant_pre(css, top_css) {
                uc_parent = css_tg(css)->parent
                        ? css_tg(css)->parent->uclamp : NULL;
@@ -8802,7 +9697,7 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css)
                }
 
                /* Immediately update descendants RUNNABLE tasks */
-               uclamp_update_active_tasks(css, clamps);
+               uclamp_update_active_tasks(css);
        }
 }
 
@@ -8961,7 +9856,8 @@ static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
 
 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
 
-static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
+static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
+                               u64 burst)
 {
        int i, ret = 0, runtime_enabled, runtime_was_enabled;
        struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
@@ -8991,6 +9887,10 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
        if (quota != RUNTIME_INF && quota > max_cfs_runtime)
                return -EINVAL;
 
+       if (quota != RUNTIME_INF && (burst > quota ||
+                                    burst + quota > max_cfs_runtime))
+               return -EINVAL;
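	/*
	 * Editorial worked example (not part of this patch): with period =
	 * 100000us, quota = 50000us and burst = 25000us, a group that left
	 * quota unused in earlier periods can accumulate up to 'burst' of it
	 * and run for at most 50000 + 25000 = 75000us within one period. The
	 * check above caps burst at quota and keeps quota + burst within
	 * max_cfs_runtime.
	 */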
+
        /*
         * Prevent race between setting of cfs_rq->runtime_enabled and
         * unthrottle_offline_cfs_rqs().
@@ -9012,6 +9912,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
        raw_spin_lock_irq(&cfs_b->lock);
        cfs_b->period = ns_to_ktime(period);
        cfs_b->quota = quota;
+       cfs_b->burst = burst;
 
        __refill_cfs_bandwidth_runtime(cfs_b);
 
@@ -9045,9 +9946,10 @@ out_unlock:
 
 static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
 {
-       u64 quota, period;
+       u64 quota, period, burst;
 
        period = ktime_to_ns(tg->cfs_bandwidth.period);
+       burst = tg->cfs_bandwidth.burst;
        if (cfs_quota_us < 0)
                quota = RUNTIME_INF;
        else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)
@@ -9055,7 +9957,7 @@ static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
        else
                return -EINVAL;
 
-       return tg_set_cfs_bandwidth(tg, period, quota);
+       return tg_set_cfs_bandwidth(tg, period, quota, burst);
 }
 
 static long tg_get_cfs_quota(struct task_group *tg)
@@ -9073,15 +9975,16 @@ static long tg_get_cfs_quota(struct task_group *tg)
 
 static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
 {
-       u64 quota, period;
+       u64 quota, period, burst;
 
        if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC)
                return -EINVAL;
 
        period = (u64)cfs_period_us * NSEC_PER_USEC;
        quota = tg->cfs_bandwidth.quota;
+       burst = tg->cfs_bandwidth.burst;
 
-       return tg_set_cfs_bandwidth(tg, period, quota);
+       return tg_set_cfs_bandwidth(tg, period, quota, burst);
 }
 
 static long tg_get_cfs_period(struct task_group *tg)
@@ -9094,6 +9997,30 @@ static long tg_get_cfs_period(struct task_group *tg)
        return cfs_period_us;
 }
 
+static int tg_set_cfs_burst(struct task_group *tg, long cfs_burst_us)
+{
+       u64 quota, period, burst;
+
+       if ((u64)cfs_burst_us > U64_MAX / NSEC_PER_USEC)
+               return -EINVAL;
+
+       burst = (u64)cfs_burst_us * NSEC_PER_USEC;
+       period = ktime_to_ns(tg->cfs_bandwidth.period);
+       quota = tg->cfs_bandwidth.quota;
+
+       return tg_set_cfs_bandwidth(tg, period, quota, burst);
+}
+
+static long tg_get_cfs_burst(struct task_group *tg)
+{
+       u64 burst_us;
+
+       burst_us = tg->cfs_bandwidth.burst;
+       do_div(burst_us, NSEC_PER_USEC);
+
+       return burst_us;
+}
+
 static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
                                  struct cftype *cft)
 {
@@ -9118,6 +10045,18 @@ static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
        return tg_set_cfs_period(css_tg(css), cfs_period_us);
 }
 
+static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state *css,
+                                 struct cftype *cft)
+{
+       return tg_get_cfs_burst(css_tg(css));
+}
+
+static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state *css,
+                                  struct cftype *cftype, u64 cfs_burst_us)
+{
+       return tg_set_cfs_burst(css_tg(css), cfs_burst_us);
+}
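/*
 * Editorial note (not part of this patch): these two handlers back the new
 * knobs registered further down, "cpu.cfs_burst_us" in the cgroup v1 cpu
 * controller and "cpu.max.burst" in cgroup v2 (CFTYPE_NOT_ON_ROOT, so it is
 * absent from the root cgroup). Both read and write microseconds.
 */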
+
 struct cfs_schedulable_data {
        struct task_group *tg;
        u64 period, quota;
@@ -9270,6 +10209,11 @@ static struct cftype cpu_legacy_files[] = {
                .read_u64 = cpu_cfs_period_read_u64,
                .write_u64 = cpu_cfs_period_write_u64,
        },
+       {
+               .name = "cfs_burst_us",
+               .read_u64 = cpu_cfs_burst_read_u64,
+               .write_u64 = cpu_cfs_burst_write_u64,
+       },
        {
                .name = "stat",
                .seq_show = cpu_cfs_stat_show,
@@ -9435,12 +10379,13 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of,
 {
        struct task_group *tg = css_tg(of_css(of));
        u64 period = tg_get_cfs_period(tg);
+       u64 burst = tg_get_cfs_burst(tg);
        u64 quota;
        int ret;
 
        ret = cpu_period_quota_parse(buf, &period, &quota);
        if (!ret)
-               ret = tg_set_cfs_bandwidth(tg, period, quota);
+               ret = tg_set_cfs_bandwidth(tg, period, quota, burst);
        return ret ?: nbytes;
 }
 #endif
@@ -9467,6 +10412,12 @@ static struct cftype cpu_files[] = {
                .seq_show = cpu_max_show,
                .write = cpu_max_write,
        },
+       {
+               .name = "max.burst",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .read_u64 = cpu_cfs_burst_read_u64,
+               .write_u64 = cpu_cfs_burst_write_u64,
+       },
 #endif
 #ifdef CONFIG_UCLAMP_TASK_GROUP
        {