X-Git-Url: http://git.monstr.eu/?a=blobdiff_plain;f=kernel%2Fsched%2Fcore.c;h=4a2a448d8dd407a04ab4dfd9a94447a1d0acf739;hb=54a728dc5e4feb0a9278ad62b19f34ad21ed0ee4;hp=5226cc26a095f427dafa864ff37d39d529e2ba85;hpb=8ac91e6c6033ebc12c5c1e4aa171b81a662bd70f;p=linux-2.6-microblaze.git diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5226cc26a095..4a2a448d8dd4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -84,6 +84,272 @@ unsigned int sysctl_sched_rt_period = 1000000; __read_mostly int scheduler_running; +#ifdef CONFIG_SCHED_CORE + +DEFINE_STATIC_KEY_FALSE(__sched_core_enabled); + +/* kernel prio, less is more */ +static inline int __task_prio(struct task_struct *p) +{ + if (p->sched_class == &stop_sched_class) /* trumps deadline */ + return -2; + + if (rt_prio(p->prio)) /* includes deadline */ + return p->prio; /* [-1, 99] */ + + if (p->sched_class == &idle_sched_class) + return MAX_RT_PRIO + NICE_WIDTH; /* 140 */ + + return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */ +} + +/* + * l(a,b) + * le(a,b) := !l(b,a) + * g(a,b) := l(b,a) + * ge(a,b) := !l(a,b) + */ + +/* real prio, less is less */ +static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool in_fi) +{ + + int pa = __task_prio(a), pb = __task_prio(b); + + if (-pa < -pb) + return true; + + if (-pb < -pa) + return false; + + if (pa == -1) /* dl_prio() doesn't work because of stop_class above */ + return !dl_time_before(a->dl.deadline, b->dl.deadline); + + if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */ + return cfs_prio_less(a, b, in_fi); + + return false; +} + +static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b) +{ + if (a->core_cookie < b->core_cookie) + return true; + + if (a->core_cookie > b->core_cookie) + return false; + + /* flip prio, so high prio is leftmost */ + if (prio_less(b, a, task_rq(a)->core->core_forceidle)) + return true; + + return false; +} + +#define __node_2_sc(node) rb_entry((node), struct task_struct, core_node) + +static inline bool rb_sched_core_less(struct rb_node *a, const struct rb_node *b) +{ + return __sched_core_less(__node_2_sc(a), __node_2_sc(b)); +} + +static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node) +{ + const struct task_struct *p = __node_2_sc(node); + unsigned long cookie = (unsigned long)key; + + if (cookie < p->core_cookie) + return -1; + + if (cookie > p->core_cookie) + return 1; + + return 0; +} + +void sched_core_enqueue(struct rq *rq, struct task_struct *p) +{ + rq->core->core_task_seq++; + + if (!p->core_cookie) + return; + + rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less); +} + +void sched_core_dequeue(struct rq *rq, struct task_struct *p) +{ + rq->core->core_task_seq++; + + if (!sched_core_enqueued(p)) + return; + + rb_erase(&p->core_node, &rq->core_tree); + RB_CLEAR_NODE(&p->core_node); +} + +/* + * Find left-most (aka, highest priority) task matching @cookie. + */ +static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie) +{ + struct rb_node *node; + + node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp); + /* + * The idle task always matches any cookie! + */ + if (!node) + return idle_sched_class.pick_task(rq); + + return __node_2_sc(node); +} + +static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie) +{ + struct rb_node *node = &p->core_node; + + node = rb_next(node); + if (!node) + return NULL; + + p = container_of(node, struct task_struct, core_node); + if (p->core_cookie != cookie) + return NULL; + + return p; +} + +/* + * Magic required such that: + * + * raw_spin_rq_lock(rq); + * ... + * raw_spin_rq_unlock(rq); + * + * ends up locking and unlocking the _same_ lock, and all CPUs + * always agree on what rq has what lock. + * + * XXX entirely possible to selectively enable cores, don't bother for now. + */ + +static DEFINE_MUTEX(sched_core_mutex); +static atomic_t sched_core_count; +static struct cpumask sched_core_mask; + +static void __sched_core_flip(bool enabled) +{ + int cpu, t, i; + + cpus_read_lock(); + + /* + * Toggle the online cores, one by one. + */ + cpumask_copy(&sched_core_mask, cpu_online_mask); + for_each_cpu(cpu, &sched_core_mask) { + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + + i = 0; + local_irq_disable(); + for_each_cpu(t, smt_mask) { + /* supports up to SMT8 */ + raw_spin_lock_nested(&cpu_rq(t)->__lock, i++); + } + + for_each_cpu(t, smt_mask) + cpu_rq(t)->core_enabled = enabled; + + for_each_cpu(t, smt_mask) + raw_spin_unlock(&cpu_rq(t)->__lock); + local_irq_enable(); + + cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask); + } + + /* + * Toggle the offline CPUs. + */ + cpumask_copy(&sched_core_mask, cpu_possible_mask); + cpumask_andnot(&sched_core_mask, &sched_core_mask, cpu_online_mask); + + for_each_cpu(cpu, &sched_core_mask) + cpu_rq(cpu)->core_enabled = enabled; + + cpus_read_unlock(); +} + +static void sched_core_assert_empty(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + WARN_ON_ONCE(!RB_EMPTY_ROOT(&cpu_rq(cpu)->core_tree)); +} + +static void __sched_core_enable(void) +{ + static_branch_enable(&__sched_core_enabled); + /* + * Ensure all previous instances of raw_spin_rq_*lock() have finished + * and future ones will observe !sched_core_disabled(). + */ + synchronize_rcu(); + __sched_core_flip(true); + sched_core_assert_empty(); +} + +static void __sched_core_disable(void) +{ + sched_core_assert_empty(); + __sched_core_flip(false); + static_branch_disable(&__sched_core_enabled); +} + +void sched_core_get(void) +{ + if (atomic_inc_not_zero(&sched_core_count)) + return; + + mutex_lock(&sched_core_mutex); + if (!atomic_read(&sched_core_count)) + __sched_core_enable(); + + smp_mb__before_atomic(); + atomic_inc(&sched_core_count); + mutex_unlock(&sched_core_mutex); +} + +static void __sched_core_put(struct work_struct *work) +{ + if (atomic_dec_and_mutex_lock(&sched_core_count, &sched_core_mutex)) { + __sched_core_disable(); + mutex_unlock(&sched_core_mutex); + } +} + +void sched_core_put(void) +{ + static DECLARE_WORK(_work, __sched_core_put); + + /* + * "There can be only one" + * + * Either this is the last one, or we don't actually need to do any + * 'work'. If it is the last *again*, we rely on + * WORK_STRUCT_PENDING_BIT. + */ + if (!atomic_add_unless(&sched_core_count, -1, 1)) + schedule_work(&_work); +} + +#else /* !CONFIG_SCHED_CORE */ + +static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { } +static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { } + +#endif /* CONFIG_SCHED_CORE */ + /* * part of the period that we allow rt tasks to run in us. * default: 0.95s @@ -184,6 +450,79 @@ int sysctl_sched_rt_runtime = 950000; * */ +void raw_spin_rq_lock_nested(struct rq *rq, int subclass) +{ + raw_spinlock_t *lock; + + /* Matches synchronize_rcu() in __sched_core_enable() */ + preempt_disable(); + if (sched_core_disabled()) { + raw_spin_lock_nested(&rq->__lock, subclass); + /* preempt_count *MUST* be > 1 */ + preempt_enable_no_resched(); + return; + } + + for (;;) { + lock = __rq_lockp(rq); + raw_spin_lock_nested(lock, subclass); + if (likely(lock == __rq_lockp(rq))) { + /* preempt_count *MUST* be > 1 */ + preempt_enable_no_resched(); + return; + } + raw_spin_unlock(lock); + } +} + +bool raw_spin_rq_trylock(struct rq *rq) +{ + raw_spinlock_t *lock; + bool ret; + + /* Matches synchronize_rcu() in __sched_core_enable() */ + preempt_disable(); + if (sched_core_disabled()) { + ret = raw_spin_trylock(&rq->__lock); + preempt_enable(); + return ret; + } + + for (;;) { + lock = __rq_lockp(rq); + ret = raw_spin_trylock(lock); + if (!ret || (likely(lock == __rq_lockp(rq)))) { + preempt_enable(); + return ret; + } + raw_spin_unlock(lock); + } +} + +void raw_spin_rq_unlock(struct rq *rq) +{ + raw_spin_unlock(rq_lockp(rq)); +} + +#ifdef CONFIG_SMP +/* + * double_rq_lock - safely lock two runqueues + */ +void double_rq_lock(struct rq *rq1, struct rq *rq2) +{ + lockdep_assert_irqs_disabled(); + + if (rq_order_less(rq2, rq1)) + swap(rq1, rq2); + + raw_spin_rq_lock(rq1); + if (__rq_lockp(rq1) == __rq_lockp(rq2)) + return; + + raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING); +} +#endif + /* * __task_rq_lock - lock the rq @p resides on. */ @@ -196,12 +535,12 @@ struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) for (;;) { rq = task_rq(p); - raw_spin_lock(&rq->lock); + raw_spin_rq_lock(rq); if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { rq_pin_lock(rq, rf); return rq; } - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); while (unlikely(task_on_rq_migrating(p))) cpu_relax(); @@ -220,7 +559,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) for (;;) { raw_spin_lock_irqsave(&p->pi_lock, rf->flags); rq = task_rq(p); - raw_spin_lock(&rq->lock); + raw_spin_rq_lock(rq); /* * move_queued_task() task_rq_lock() * @@ -242,7 +581,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) rq_pin_lock(rq, rf); return rq; } - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); while (unlikely(task_on_rq_migrating(p))) @@ -312,7 +651,7 @@ void update_rq_clock(struct rq *rq) { s64 delta; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); if (rq->clock_update_flags & RQCF_ACT_SKIP) return; @@ -585,7 +924,6 @@ void wake_up_q(struct wake_q_head *head) struct task_struct *task; task = container_of(node, struct task_struct, wake_q); - BUG_ON(!task); /* Task can safely be re-inserted now: */ node = node->next; task->wake_q.next = NULL; @@ -611,7 +949,7 @@ void resched_curr(struct rq *rq) struct task_struct *curr = rq->curr; int cpu; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); if (test_tsk_need_resched(curr)) return; @@ -635,10 +973,10 @@ void resched_cpu(int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_rq_lock_irqsave(rq, flags); if (cpu_online(cpu) || cpu == smp_processor_id()) resched_curr(rq); - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_rq_unlock_irqrestore(rq, flags); } #ifdef CONFIG_SMP @@ -1065,9 +1403,10 @@ static void uclamp_sync_util_min_rt_default(void) static inline struct uclamp_se uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id) { + /* Copy by value as we could modify it */ struct uclamp_se uc_req = p->uclamp_req[clamp_id]; #ifdef CONFIG_UCLAMP_TASK_GROUP - struct uclamp_se uc_max; + unsigned int tg_min, tg_max, value; /* * Tasks in autogroups or root task group will be @@ -1078,9 +1417,11 @@ uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id) if (task_group(p) == &root_task_group) return uc_req; - uc_max = task_group(p)->uclamp[clamp_id]; - if (uc_req.value > uc_max.value || !uc_req.user_defined) - return uc_max; + tg_min = task_group(p)->uclamp[UCLAMP_MIN].value; + tg_max = task_group(p)->uclamp[UCLAMP_MAX].value; + value = uc_req.value; + value = clamp(value, tg_min, tg_max); + uclamp_se_set(&uc_req, value, false); #endif return uc_req; @@ -1137,7 +1478,7 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, struct uclamp_se *uc_se = &p->uclamp[clamp_id]; struct uclamp_bucket *bucket; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); /* Update task effective clamp */ p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id); @@ -1177,7 +1518,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, unsigned int bkt_clamp; unsigned int rq_clamp; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); /* * If sched_uclamp_used was enabled after task @p was enqueued, @@ -1279,8 +1620,9 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) } static inline void -uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id) +uclamp_update_active(struct task_struct *p) { + enum uclamp_id clamp_id; struct rq_flags rf; struct rq *rq; @@ -1300,9 +1642,11 @@ uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id) * affecting a valid clamp bucket, the next time it's enqueued, * it will already see the updated clamp bucket value. */ - if (p->uclamp[clamp_id].active) { - uclamp_rq_dec_id(rq, p, clamp_id); - uclamp_rq_inc_id(rq, p, clamp_id); + for_each_clamp_id(clamp_id) { + if (p->uclamp[clamp_id].active) { + uclamp_rq_dec_id(rq, p, clamp_id); + uclamp_rq_inc_id(rq, p, clamp_id); + } } task_rq_unlock(rq, p, &rf); @@ -1310,20 +1654,14 @@ uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id) #ifdef CONFIG_UCLAMP_TASK_GROUP static inline void -uclamp_update_active_tasks(struct cgroup_subsys_state *css, - unsigned int clamps) +uclamp_update_active_tasks(struct cgroup_subsys_state *css) { - enum uclamp_id clamp_id; struct css_task_iter it; struct task_struct *p; css_task_iter_start(css, 0, &it); - while ((p = css_task_iter_next(&it))) { - for_each_clamp_id(clamp_id) { - if ((0x1 << clamp_id) & clamps) - uclamp_update_active(p, clamp_id); - } - } + while ((p = css_task_iter_next(&it))) + uclamp_update_active(p); css_task_iter_end(&it); } @@ -1596,21 +1934,27 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) update_rq_clock(rq); if (!(flags & ENQUEUE_RESTORE)) { - sched_info_queued(rq, p); + sched_info_enqueue(rq, p); psi_enqueue(p, flags & ENQUEUE_WAKEUP); } uclamp_rq_inc(rq, p); p->sched_class->enqueue_task(rq, p, flags); + + if (sched_core_enabled(rq)) + sched_core_enqueue(rq, p); } static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { + if (sched_core_enabled(rq)) + sched_core_dequeue(rq, p); + if (!(flags & DEQUEUE_NOCLOCK)) update_rq_clock(rq); if (!(flags & DEQUEUE_SAVE)) { - sched_info_dequeued(rq, p); + sched_info_dequeue(rq, p); psi_dequeue(p, flags & DEQUEUE_SLEEP); } @@ -1850,7 +2194,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, struct task_struct *p, int new_cpu) { - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); deactivate_task(rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, new_cpu); @@ -1916,7 +2260,6 @@ static int migration_cpu_stop(void *data) struct migration_arg *arg = data; struct set_affinity_pending *pending = arg->pending; struct task_struct *p = arg->task; - int dest_cpu = arg->dest_cpu; struct rq *rq = this_rq(); bool complete = false; struct rq_flags rf; @@ -1954,19 +2297,15 @@ static int migration_cpu_stop(void *data) if (pending) { p->migration_pending = NULL; complete = true; - } - if (dest_cpu < 0) { if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) goto out; - - dest_cpu = cpumask_any_distribute(&p->cpus_mask); } if (task_on_rq_queued(p)) - rq = __migrate_task(rq, &rf, p, dest_cpu); + rq = __migrate_task(rq, &rf, p, arg->dest_cpu); else - p->wake_cpu = dest_cpu; + p->wake_cpu = arg->dest_cpu; /* * XXX __migrate_task() can fail, at which point we might end @@ -2024,7 +2363,7 @@ int push_cpu_stop(void *arg) struct task_struct *p = arg; raw_spin_lock_irq(&p->pi_lock); - raw_spin_lock(&rq->lock); + raw_spin_rq_lock(rq); if (task_rq(p) != rq) goto out_unlock; @@ -2054,7 +2393,7 @@ int push_cpu_stop(void *arg) out_unlock: rq->push_busy = false; - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); raw_spin_unlock_irq(&p->pi_lock); put_task_struct(p); @@ -2107,7 +2446,7 @@ __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 * Because __kthread_bind() calls this on blocked tasks without * holding rq->lock. */ - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); } if (running) @@ -2249,7 +2588,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag init_completion(&my_pending.done); my_pending.arg = (struct migration_arg) { .task = p, - .dest_cpu = -1, /* any */ + .dest_cpu = dest_cpu, .pending = &my_pending, }; @@ -2257,6 +2596,15 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag } else { pending = p->migration_pending; refcount_inc(&pending->refs); + /* + * Affinity has changed, but we've already installed a + * pending. migration_cpu_stop() *must* see this, else + * we risk a completion of the pending despite having a + * task on a disallowed CPU. + * + * Serialized by p->pi_lock, so this is safe. + */ + pending->arg.dest_cpu = dest_cpu; } } pending = p->migration_pending; @@ -2277,7 +2625,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag return -EINVAL; } - if (task_running(rq, p) || p->state == TASK_WAKING) { + if (task_running(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) { /* * MIGRATE_ENABLE gets here because 'p == current', but for * anything else we cannot do is_migration_disabled(), punt @@ -2420,19 +2768,20 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { #ifdef CONFIG_SCHED_DEBUG + unsigned int state = READ_ONCE(p->__state); + /* * We should never call set_task_cpu() on a blocked task, * ttwu() will sort out the placement. */ - WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && - !p->on_rq); + WARN_ON_ONCE(state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq); /* * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING, * because schedstat_wait_{start,end} rebase migrating task's wait_start * time relying on p->on_rq. */ - WARN_ON_ONCE(p->state == TASK_RUNNING && + WARN_ON_ONCE(state == TASK_RUNNING && p->sched_class == &fair_sched_class && (p->on_rq && !task_on_rq_migrating(p))); @@ -2448,7 +2797,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * task_rq_lock(). */ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || - lockdep_is_held(&task_rq(p)->lock))); + lockdep_is_held(__rq_lockp(task_rq(p))))); #endif /* * Clearly, migrating tasks to offline CPUs is a fairly daft thing. @@ -2604,7 +2953,7 @@ out: * smp_call_function() if an IPI is sent by the same process we are * waiting to become inactive. */ -unsigned long wait_task_inactive(struct task_struct *p, long match_state) +unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state) { int running, queued; struct rq_flags rf; @@ -2632,7 +2981,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * is actually now running somewhere else! */ while (task_running(rq, p)) { - if (match_state && unlikely(p->state != match_state)) + if (match_state && unlikely(READ_ONCE(p->__state) != match_state)) return 0; cpu_relax(); } @@ -2647,7 +2996,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) running = task_running(rq, p); queued = task_on_rq_queued(p); ncsw = 0; - if (!match_state || p->state == match_state) + if (!match_state || READ_ONCE(p->__state) == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ task_rq_unlock(rq, p, &rf); @@ -2956,7 +3305,7 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, struct rq_flags *rf) { check_preempt_curr(rq, p, wake_flags); - p->state = TASK_RUNNING; + WRITE_ONCE(p->__state, TASK_RUNNING); trace_sched_wakeup(p); #ifdef CONFIG_SMP @@ -2979,6 +3328,9 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, if (rq->avg_idle > max) rq->avg_idle = max; + rq->wake_stamp = jiffies; + rq->wake_avg_idle = rq->avg_idle / 2; + rq->idle_stamp = 0; } #endif @@ -2990,7 +3342,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, { int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); if (p->sched_contributes_to_load) rq->nr_uninterruptible--; @@ -3345,12 +3697,12 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * - we're serialized against set_special_state() by virtue of * it disabling IRQs (this allows not taking ->pi_lock). */ - if (!(p->state & state)) + if (!(READ_ONCE(p->__state) & state)) goto out; success = 1; trace_sched_waking(p); - p->state = TASK_RUNNING; + WRITE_ONCE(p->__state, TASK_RUNNING); trace_sched_wakeup(p); goto out; } @@ -3363,7 +3715,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ raw_spin_lock_irqsave(&p->pi_lock, flags); smp_mb__after_spinlock(); - if (!(p->state & state)) + if (!(READ_ONCE(p->__state) & state)) goto unlock; trace_sched_waking(p); @@ -3429,7 +3781,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * TASK_WAKING such that we can unlock p->pi_lock before doing the * enqueue, such as ttwu_queue_wakelist(). */ - p->state = TASK_WAKING; + WRITE_ONCE(p->__state, TASK_WAKING); /* * If the owning (remote) CPU is still in the middle of schedule() with @@ -3522,7 +3874,7 @@ bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct t ret = func(p, arg); rq_unlock(rq, &rf); } else { - switch (p->state) { + switch (READ_ONCE(p->__state)) { case TASK_RUNNING: case TASK_WAKING: break; @@ -3648,7 +4000,6 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, #ifdef CONFIG_SCHEDSTATS DEFINE_STATIC_KEY_FALSE(sched_schedstats); -static bool __initdata __sched_schedstats = false; static void set_schedstats(bool enabled) { @@ -3672,16 +4023,11 @@ static int __init setup_schedstats(char *str) if (!str) goto out; - /* - * This code is called before jump labels have been set up, so we can't - * change the static branch directly just yet. Instead set a temporary - * variable so init_schedstats() can do it later. - */ if (!strcmp(str, "enable")) { - __sched_schedstats = true; + set_schedstats(true); ret = 1; } else if (!strcmp(str, "disable")) { - __sched_schedstats = false; + set_schedstats(false); ret = 1; } out: @@ -3692,11 +4038,6 @@ out: } __setup("schedstats=", setup_schedstats); -static void __init init_schedstats(void) -{ - set_schedstats(__sched_schedstats); -} - #ifdef CONFIG_PROC_SYSCTL int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -3718,8 +4059,6 @@ int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, return err; } #endif /* CONFIG_PROC_SYSCTL */ -#else /* !CONFIG_SCHEDSTATS */ -static inline void init_schedstats(void) {} #endif /* CONFIG_SCHEDSTATS */ /* @@ -3735,7 +4074,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) * nobody will actually run it, and a signal or other external * event cannot wake it up and insert it on the runqueue either. */ - p->state = TASK_NEW; + p->__state = TASK_NEW; /* * Make sure we do not leak PI boosting priority to the child. @@ -3841,7 +4180,7 @@ void wake_up_new_task(struct task_struct *p) struct rq *rq; raw_spin_lock_irqsave(&p->pi_lock, rf.flags); - p->state = TASK_RUNNING; + WRITE_ONCE(p->__state, TASK_RUNNING); #ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: @@ -4001,7 +4340,7 @@ static void do_balance_callbacks(struct rq *rq, struct callback_head *head) void (*func)(struct rq *rq); struct callback_head *next; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); while (head) { func = (void (*)(struct rq *))head->func; @@ -4024,7 +4363,7 @@ static inline struct callback_head *splice_balance_callbacks(struct rq *rq) { struct callback_head *head = rq->balance_callback; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); if (head) rq->balance_callback = NULL; @@ -4041,9 +4380,9 @@ static inline void balance_callbacks(struct rq *rq, struct callback_head *head) unsigned long flags; if (unlikely(head)) { - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_rq_lock_irqsave(rq, flags); do_balance_callbacks(rq, head); - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_rq_unlock_irqrestore(rq, flags); } } @@ -4074,10 +4413,10 @@ prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf * do an early lockdep release here: */ rq_unpin_lock(rq, rf); - spin_release(&rq->lock.dep_map, _THIS_IP_); + spin_release(&__rq_lockp(rq)->dep_map, _THIS_IP_); #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ - rq->lock.owner = next; + rq_lockp(rq)->owner = next; #endif } @@ -4088,9 +4427,9 @@ static inline void finish_lock_switch(struct rq *rq) * fix up the runqueue lock - which gets 'carried over' from * prev into current: */ - spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); + spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_); __balance_callbacks(rq); - raw_spin_unlock_irq(&rq->lock); + raw_spin_rq_unlock_irq(rq); } /* @@ -4203,7 +4542,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) * running on another CPU and we could rave with its RUNNING -> DEAD * transition, resulting in a double drop. */ - prev_state = prev->state; + prev_state = READ_ONCE(prev->__state); vtime_task_switch(prev); perf_event_task_sched_in(prev, current); finish_task(prev); @@ -4348,9 +4687,9 @@ context_switch(struct rq *rq, struct task_struct *prev, * externally visible scheduler statistics: current number of runnable * threads, total number of context switches performed since bootup. */ -unsigned long nr_running(void) +unsigned int nr_running(void) { - unsigned long i, sum = 0; + unsigned int i, sum = 0; for_each_online_cpu(i) sum += cpu_rq(i)->nr_running; @@ -4395,7 +4734,7 @@ unsigned long long nr_context_switches(void) * it does become runnable. */ -unsigned long nr_iowait_cpu(int cpu) +unsigned int nr_iowait_cpu(int cpu) { return atomic_read(&cpu_rq(cpu)->nr_iowait); } @@ -4430,9 +4769,9 @@ unsigned long nr_iowait_cpu(int cpu) * Task CPU affinities can make all that even more 'interesting'. */ -unsigned long nr_iowait(void) +unsigned int nr_iowait(void) { - unsigned long i, sum = 0; + unsigned int i, sum = 0; for_each_possible_cpu(i) sum += nr_iowait_cpu(i); @@ -4897,7 +5236,7 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP - if (!preempt && prev->state && prev->non_block_count) { + if (!preempt && READ_ONCE(prev->__state) && prev->non_block_count) { printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", prev->comm, prev->pid, prev->non_block_count); dump_stack(); @@ -4943,7 +5282,7 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, * Pick up the highest-prio task: */ static inline struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { const struct sched_class *class; struct task_struct *p; @@ -4961,7 +5300,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (unlikely(p == RETRY_TASK)) goto restart; - /* Assumes fair_sched_class->next == idle_sched_class */ + /* Assume the next prioritized class is idle_sched_class */ if (!p) { put_prev_task(rq, prev); p = pick_next_task_idle(rq); @@ -4983,6 +5322,455 @@ restart: BUG(); } +#ifdef CONFIG_SCHED_CORE +static inline bool is_task_rq_idle(struct task_struct *t) +{ + return (task_rq(t)->idle == t); +} + +static inline bool cookie_equals(struct task_struct *a, unsigned long cookie) +{ + return is_task_rq_idle(a) || (a->core_cookie == cookie); +} + +static inline bool cookie_match(struct task_struct *a, struct task_struct *b) +{ + if (is_task_rq_idle(a) || is_task_rq_idle(b)) + return true; + + return a->core_cookie == b->core_cookie; +} + +// XXX fairness/fwd progress conditions +/* + * Returns + * - NULL if there is no runnable task for this class. + * - the highest priority task for this runqueue if it matches + * rq->core->core_cookie or its priority is greater than max. + * - Else returns idle_task. + */ +static struct task_struct * +pick_task(struct rq *rq, const struct sched_class *class, struct task_struct *max, bool in_fi) +{ + struct task_struct *class_pick, *cookie_pick; + unsigned long cookie = rq->core->core_cookie; + + class_pick = class->pick_task(rq); + if (!class_pick) + return NULL; + + if (!cookie) { + /* + * If class_pick is tagged, return it only if it has + * higher priority than max. + */ + if (max && class_pick->core_cookie && + prio_less(class_pick, max, in_fi)) + return idle_sched_class.pick_task(rq); + + return class_pick; + } + + /* + * If class_pick is idle or matches cookie, return early. + */ + if (cookie_equals(class_pick, cookie)) + return class_pick; + + cookie_pick = sched_core_find(rq, cookie); + + /* + * If class > max && class > cookie, it is the highest priority task on + * the core (so far) and it must be selected, otherwise we must go with + * the cookie pick in order to satisfy the constraint. + */ + if (prio_less(cookie_pick, class_pick, in_fi) && + (!max || prio_less(max, class_pick, in_fi))) + return class_pick; + + return cookie_pick; +} + +extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi); + +static struct task_struct * +pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + struct task_struct *next, *max = NULL; + const struct sched_class *class; + const struct cpumask *smt_mask; + bool fi_before = false; + int i, j, cpu, occ = 0; + bool need_sync; + + if (!sched_core_enabled(rq)) + return __pick_next_task(rq, prev, rf); + + cpu = cpu_of(rq); + + /* Stopper task is switching into idle, no need core-wide selection. */ + if (cpu_is_offline(cpu)) { + /* + * Reset core_pick so that we don't enter the fastpath when + * coming online. core_pick would already be migrated to + * another cpu during offline. + */ + rq->core_pick = NULL; + return __pick_next_task(rq, prev, rf); + } + + /* + * If there were no {en,de}queues since we picked (IOW, the task + * pointers are all still valid), and we haven't scheduled the last + * pick yet, do so now. + * + * rq->core_pick can be NULL if no selection was made for a CPU because + * it was either offline or went offline during a sibling's core-wide + * selection. In this case, do a core-wide selection. + */ + if (rq->core->core_pick_seq == rq->core->core_task_seq && + rq->core->core_pick_seq != rq->core_sched_seq && + rq->core_pick) { + WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq); + + next = rq->core_pick; + if (next != prev) { + put_prev_task(rq, prev); + set_next_task(rq, next); + } + + rq->core_pick = NULL; + return next; + } + + put_prev_task_balance(rq, prev, rf); + + smt_mask = cpu_smt_mask(cpu); + need_sync = !!rq->core->core_cookie; + + /* reset state */ + rq->core->core_cookie = 0UL; + if (rq->core->core_forceidle) { + need_sync = true; + fi_before = true; + rq->core->core_forceidle = false; + } + + /* + * core->core_task_seq, core->core_pick_seq, rq->core_sched_seq + * + * @task_seq guards the task state ({en,de}queues) + * @pick_seq is the @task_seq we did a selection on + * @sched_seq is the @pick_seq we scheduled + * + * However, preemptions can cause multiple picks on the same task set. + * 'Fix' this by also increasing @task_seq for every pick. + */ + rq->core->core_task_seq++; + + /* + * Optimize for common case where this CPU has no cookies + * and there are no cookied tasks running on siblings. + */ + if (!need_sync) { + for_each_class(class) { + next = class->pick_task(rq); + if (next) + break; + } + + if (!next->core_cookie) { + rq->core_pick = NULL; + /* + * For robustness, update the min_vruntime_fi for + * unconstrained picks as well. + */ + WARN_ON_ONCE(fi_before); + task_vruntime_update(rq, next, false); + goto done; + } + } + + for_each_cpu(i, smt_mask) { + struct rq *rq_i = cpu_rq(i); + + rq_i->core_pick = NULL; + + if (i != cpu) + update_rq_clock(rq_i); + } + + /* + * Try and select tasks for each sibling in descending sched_class + * order. + */ + for_each_class(class) { +again: + for_each_cpu_wrap(i, smt_mask, cpu) { + struct rq *rq_i = cpu_rq(i); + struct task_struct *p; + + if (rq_i->core_pick) + continue; + + /* + * If this sibling doesn't yet have a suitable task to + * run; ask for the most eligible task, given the + * highest priority task already selected for this + * core. + */ + p = pick_task(rq_i, class, max, fi_before); + if (!p) + continue; + + if (!is_task_rq_idle(p)) + occ++; + + rq_i->core_pick = p; + if (rq_i->idle == p && rq_i->nr_running) { + rq->core->core_forceidle = true; + if (!fi_before) + rq->core->core_forceidle_seq++; + } + + /* + * If this new candidate is of higher priority than the + * previous; and they're incompatible; we need to wipe + * the slate and start over. pick_task makes sure that + * p's priority is more than max if it doesn't match + * max's cookie. + * + * NOTE: this is a linear max-filter and is thus bounded + * in execution time. + */ + if (!max || !cookie_match(max, p)) { + struct task_struct *old_max = max; + + rq->core->core_cookie = p->core_cookie; + max = p; + + if (old_max) { + rq->core->core_forceidle = false; + for_each_cpu(j, smt_mask) { + if (j == i) + continue; + + cpu_rq(j)->core_pick = NULL; + } + occ = 1; + goto again; + } + } + } + } + + rq->core->core_pick_seq = rq->core->core_task_seq; + next = rq->core_pick; + rq->core_sched_seq = rq->core->core_pick_seq; + + /* Something should have been selected for current CPU */ + WARN_ON_ONCE(!next); + + /* + * Reschedule siblings + * + * NOTE: L1TF -- at this point we're no longer running the old task and + * sending an IPI (below) ensures the sibling will no longer be running + * their task. This ensures there is no inter-sibling overlap between + * non-matching user state. + */ + for_each_cpu(i, smt_mask) { + struct rq *rq_i = cpu_rq(i); + + /* + * An online sibling might have gone offline before a task + * could be picked for it, or it might be offline but later + * happen to come online, but its too late and nothing was + * picked for it. That's Ok - it will pick tasks for itself, + * so ignore it. + */ + if (!rq_i->core_pick) + continue; + + /* + * Update for new !FI->FI transitions, or if continuing to be in !FI: + * fi_before fi update? + * 0 0 1 + * 0 1 1 + * 1 0 1 + * 1 1 0 + */ + if (!(fi_before && rq->core->core_forceidle)) + task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle); + + rq_i->core_pick->core_occupation = occ; + + if (i == cpu) { + rq_i->core_pick = NULL; + continue; + } + + /* Did we break L1TF mitigation requirements? */ + WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick)); + + if (rq_i->curr == rq_i->core_pick) { + rq_i->core_pick = NULL; + continue; + } + + resched_curr(rq_i); + } + +done: + set_next_task(rq, next); + return next; +} + +static bool try_steal_cookie(int this, int that) +{ + struct rq *dst = cpu_rq(this), *src = cpu_rq(that); + struct task_struct *p; + unsigned long cookie; + bool success = false; + + local_irq_disable(); + double_rq_lock(dst, src); + + cookie = dst->core->core_cookie; + if (!cookie) + goto unlock; + + if (dst->curr != dst->idle) + goto unlock; + + p = sched_core_find(src, cookie); + if (p == src->idle) + goto unlock; + + do { + if (p == src->core_pick || p == src->curr) + goto next; + + if (!cpumask_test_cpu(this, &p->cpus_mask)) + goto next; + + if (p->core_occupation > dst->idle->core_occupation) + goto next; + + p->on_rq = TASK_ON_RQ_MIGRATING; + deactivate_task(src, p, 0); + set_task_cpu(p, this); + activate_task(dst, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; + + resched_curr(dst); + + success = true; + break; + +next: + p = sched_core_next(p, cookie); + } while (p); + +unlock: + double_rq_unlock(dst, src); + local_irq_enable(); + + return success; +} + +static bool steal_cookie_task(int cpu, struct sched_domain *sd) +{ + int i; + + for_each_cpu_wrap(i, sched_domain_span(sd), cpu) { + if (i == cpu) + continue; + + if (need_resched()) + break; + + if (try_steal_cookie(cpu, i)) + return true; + } + + return false; +} + +static void sched_core_balance(struct rq *rq) +{ + struct sched_domain *sd; + int cpu = cpu_of(rq); + + preempt_disable(); + rcu_read_lock(); + raw_spin_rq_unlock_irq(rq); + for_each_domain(cpu, sd) { + if (need_resched()) + break; + + if (steal_cookie_task(cpu, sd)) + break; + } + raw_spin_rq_lock_irq(rq); + rcu_read_unlock(); + preempt_enable(); +} + +static DEFINE_PER_CPU(struct callback_head, core_balance_head); + +void queue_core_balance(struct rq *rq) +{ + if (!sched_core_enabled(rq)) + return; + + if (!rq->core->core_cookie) + return; + + if (!rq->nr_running) /* not forced idle */ + return; + + queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance); +} + +static inline void sched_core_cpu_starting(unsigned int cpu) +{ + const struct cpumask *smt_mask = cpu_smt_mask(cpu); + struct rq *rq, *core_rq = NULL; + int i; + + core_rq = cpu_rq(cpu)->core; + + if (!core_rq) { + for_each_cpu(i, smt_mask) { + rq = cpu_rq(i); + if (rq->core && rq->core == rq) + core_rq = rq; + } + + if (!core_rq) + core_rq = cpu_rq(cpu); + + for_each_cpu(i, smt_mask) { + rq = cpu_rq(i); + + WARN_ON_ONCE(rq->core && rq->core != core_rq); + rq->core = core_rq; + } + } +} +#else /* !CONFIG_SCHED_CORE */ + +static inline void sched_core_cpu_starting(unsigned int cpu) {} + +static struct task_struct * +pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + return __pick_next_task(rq, prev, rf); +} + +#endif /* CONFIG_SCHED_CORE */ + /* * __schedule() is the main scheduler function. * @@ -5074,10 +5862,10 @@ static void __sched notrace __schedule(bool preempt) * - we form a control dependency vs deactivate_task() below. * - ptrace_{,un}freeze_traced() can change ->state underneath us. */ - prev_state = prev->state; + prev_state = READ_ONCE(prev->__state); if (!preempt && prev_state) { if (signal_pending_state(prev_state, prev)) { - prev->state = TASK_RUNNING; + WRITE_ONCE(prev->__state, TASK_RUNNING); } else { prev->sched_contributes_to_load = (prev_state & TASK_UNINTERRUPTIBLE) && @@ -5150,7 +5938,7 @@ static void __sched notrace __schedule(bool preempt) rq_unpin_lock(rq, &rf); __balance_callbacks(rq); - raw_spin_unlock_irq(&rq->lock); + raw_spin_rq_unlock_irq(rq); } } @@ -5174,7 +5962,7 @@ static inline void sched_submit_work(struct task_struct *tsk) { unsigned int task_flags; - if (!tsk->state) + if (task_is_running(tsk)) return; task_flags = tsk->flags; @@ -5249,7 +6037,7 @@ void __sched schedule_idle(void) * current task can be in any other state. Note, idle is always in the * TASK_RUNNING state. */ - WARN_ON_ONCE(current->state); + WARN_ON_ONCE(current->__state); do { __schedule(false); } while (need_resched()); @@ -5692,7 +6480,7 @@ out_unlock: rq_unpin_lock(rq, &rf); __balance_callbacks(rq); - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); preempt_enable(); } @@ -6389,7 +7177,6 @@ int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) { return __sched_setscheduler(p, attr, false, true); } -EXPORT_SYMBOL_GPL(sched_setattr_nocheck); /** * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. @@ -7149,7 +7936,7 @@ again: if (curr->sched_class != p->sched_class) goto out_unlock; - if (task_running(p_rq, p) || p->state) + if (task_running(p_rq, p) || !task_is_running(p)) goto out_unlock; yielded = curr->sched_class->yield_to_task(rq, p); @@ -7352,7 +8139,7 @@ void sched_show_task(struct task_struct *p) pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p)); - if (p->state == TASK_RUNNING) + if (task_is_running(p)) pr_cont(" running task "); #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); @@ -7376,26 +8163,28 @@ EXPORT_SYMBOL_GPL(sched_show_task); static inline bool state_filter_match(unsigned long state_filter, struct task_struct *p) { + unsigned int state = READ_ONCE(p->__state); + /* no filter, everything matches */ if (!state_filter) return true; /* filter, but doesn't match */ - if (!(p->state & state_filter)) + if (!(state & state_filter)) return false; /* * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows * TASK_KILLABLE). */ - if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) + if (state_filter == TASK_UNINTERRUPTIBLE && state == TASK_IDLE) return false; return true; } -void show_state_filter(unsigned long state_filter) +void show_state_filter(unsigned int state_filter) { struct task_struct *g, *p; @@ -7434,19 +8223,32 @@ void show_state_filter(unsigned long state_filter) * NOTE: this function does not set the idle thread's NEED_RESCHED * flag, to make booting more robust. */ -void init_idle(struct task_struct *idle, int cpu) +void __init init_idle(struct task_struct *idle, int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long flags; __sched_fork(0, idle); + /* + * The idle task doesn't need the kthread struct to function, but it + * is dressed up as a per-CPU kthread and thus needs to play the part + * if we want to avoid special-casing it in code that deals with per-CPU + * kthreads. + */ + set_kthread_struct(idle); + raw_spin_lock_irqsave(&idle->pi_lock, flags); - raw_spin_lock(&rq->lock); + raw_spin_rq_lock(rq); - idle->state = TASK_RUNNING; + idle->__state = TASK_RUNNING; idle->se.exec_start = sched_clock(); - idle->flags |= PF_IDLE; + /* + * PF_KTHREAD should already be set at this point; regardless, make it + * look like a proper per-CPU kthread. + */ + idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY; + kthread_set_per_cpu(idle, cpu); scs_task_reset(idle); kasan_unpoison_task_stack(idle); @@ -7480,7 +8282,7 @@ void init_idle(struct task_struct *idle, int cpu) #ifdef CONFIG_SMP idle->on_cpu = 1; #endif - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); raw_spin_unlock_irqrestore(&idle->pi_lock, flags); /* Set the preempt count _outside_ the spinlocks! */ @@ -7646,7 +8448,7 @@ static void balance_push(struct rq *rq) { struct task_struct *push_task = rq->curr; - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); SCHED_WARN_ON(rq->cpu != smp_processor_id()); /* @@ -7663,12 +8465,8 @@ static void balance_push(struct rq *rq) /* * Both the cpu-hotplug and stop task are in this case and are * required to complete the hotplug process. - * - * XXX: the idle task does not match kthread_is_per_cpu() due to - * histerical raisins. */ - if (rq->idle == push_task || - kthread_is_per_cpu(push_task) || + if (kthread_is_per_cpu(push_task) || is_migration_disabled(push_task)) { /* @@ -7684,9 +8482,9 @@ static void balance_push(struct rq *rq) */ if (!rq->nr_running && !rq_has_pinned_tasks(rq) && rcuwait_active(&rq->hotplug_wait)) { - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); rcuwait_wake_up(&rq->hotplug_wait); - raw_spin_lock(&rq->lock); + raw_spin_rq_lock(rq); } return; } @@ -7696,7 +8494,7 @@ static void balance_push(struct rq *rq) * Temporarily drop rq->lock such that we can wake-up the stop task. * Both preemption and IRQs are still disabled. */ - raw_spin_unlock(&rq->lock); + raw_spin_rq_unlock(rq); stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task, this_cpu_ptr(&push_work)); /* @@ -7704,7 +8502,7 @@ static void balance_push(struct rq *rq) * schedule(). The next pick is obviously going to be the stop task * which kthread_is_per_cpu() and will push this task away. */ - raw_spin_lock(&rq->lock); + raw_spin_rq_lock(rq); } static void balance_push_set(int cpu, bool on) @@ -7948,6 +8746,7 @@ static void sched_rq_cpu_starting(unsigned int cpu) int sched_cpu_starting(unsigned int cpu) { + sched_core_cpu_starting(cpu); sched_rq_cpu_starting(cpu); sched_tick_start(cpu); return 0; @@ -7994,7 +8793,7 @@ static void dump_rq_tasks(struct rq *rq, const char *loglvl) struct task_struct *g, *p; int cpu = cpu_of(rq); - lockdep_assert_held(&rq->lock); + lockdep_assert_rq_held(rq); printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running); for_each_process_thread(g, p) { @@ -8046,6 +8845,7 @@ void __init sched_init_smp(void) /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) BUG(); + current->flags &= ~PF_NO_SETAFFINITY; sched_init_granularity(); init_sched_rt_class(); @@ -8167,7 +8967,7 @@ void __init sched_init(void) struct rq *rq; rq = cpu_rq(i); - raw_spin_lock_init(&rq->lock); + raw_spin_lock_init(&rq->__lock); rq->nr_running = 0; rq->calc_load_active = 0; rq->calc_load_update = jiffies + LOAD_FREQ; @@ -8215,6 +9015,8 @@ void __init sched_init(void) rq->online = 0; rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; + rq->wake_stamp = jiffies; + rq->wake_avg_idle = rq->avg_idle; rq->max_idle_balance_cost = sysctl_sched_migration_cost; INIT_LIST_HEAD(&rq->cfs_tasks); @@ -8232,6 +9034,16 @@ void __init sched_init(void) #endif /* CONFIG_SMP */ hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); + +#ifdef CONFIG_SCHED_CORE + rq->core = NULL; + rq->core_pick = NULL; + rq->core_enabled = 0; + rq->core_tree = RB_ROOT; + rq->core_forceidle = false; + + rq->core_cookie = 0UL; +#endif } set_load_weight(&init_task, false); @@ -8258,8 +9070,6 @@ void __init sched_init(void) #endif init_sched_fair_class(); - init_schedstats(); - psi_init(); init_uclamp(); @@ -8277,15 +9087,15 @@ static inline int preempt_count_equals(int preempt_offset) void __might_sleep(const char *file, int line, int preempt_offset) { + unsigned int state = get_current_state(); /* * Blocking primitives will set (and therefore destroy) current->state, * since we will exit with TASK_RUNNING make sure we enter with it, * otherwise we will destroy state. */ - WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, + WARN_ONCE(state != TASK_RUNNING && current->task_state_change, "do not call blocking ops when !TASK_RUNNING; " - "state=%lx set at [<%p>] %pS\n", - current->state, + "state=%x set at [<%p>] %pS\n", state, (void *)current->task_state_change, (void *)current->task_state_change); @@ -8681,7 +9491,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) #ifdef CONFIG_UCLAMP_TASK_GROUP /* Propagate the effective uclamp value for the new group */ + mutex_lock(&uclamp_mutex); + rcu_read_lock(); cpu_util_update_eff(css); + rcu_read_unlock(); + mutex_unlock(&uclamp_mutex); #endif return 0; @@ -8742,7 +9556,7 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) * has happened. This would lead to problems with PELT, due to * move wanting to detach+attach while we're not attached yet. */ - if (task->state == TASK_NEW) + if (READ_ONCE(task->__state) == TASK_NEW) ret = -EINVAL; raw_spin_unlock_irq(&task->pi_lock); @@ -8771,6 +9585,9 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css) enum uclamp_id clamp_id; unsigned int clamps; + lockdep_assert_held(&uclamp_mutex); + SCHED_WARN_ON(!rcu_read_lock_held()); + css_for_each_descendant_pre(css, top_css) { uc_parent = css_tg(css)->parent ? css_tg(css)->parent->uclamp : NULL; @@ -8803,7 +9620,7 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css) } /* Immediately update descendants RUNNABLE tasks */ - uclamp_update_active_tasks(css, clamps); + uclamp_update_active_tasks(css); } } @@ -8962,7 +9779,8 @@ static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC; static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); -static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) +static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, + u64 burst) { int i, ret = 0, runtime_enabled, runtime_was_enabled; struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; @@ -8992,6 +9810,10 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) if (quota != RUNTIME_INF && quota > max_cfs_runtime) return -EINVAL; + if (quota != RUNTIME_INF && (burst > quota || + burst + quota > max_cfs_runtime)) + return -EINVAL; + /* * Prevent race between setting of cfs_rq->runtime_enabled and * unthrottle_offline_cfs_rqs(). @@ -9013,6 +9835,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) raw_spin_lock_irq(&cfs_b->lock); cfs_b->period = ns_to_ktime(period); cfs_b->quota = quota; + cfs_b->burst = burst; __refill_cfs_bandwidth_runtime(cfs_b); @@ -9046,9 +9869,10 @@ out_unlock: static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) { - u64 quota, period; + u64 quota, period, burst; period = ktime_to_ns(tg->cfs_bandwidth.period); + burst = tg->cfs_bandwidth.burst; if (cfs_quota_us < 0) quota = RUNTIME_INF; else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC) @@ -9056,7 +9880,7 @@ static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) else return -EINVAL; - return tg_set_cfs_bandwidth(tg, period, quota); + return tg_set_cfs_bandwidth(tg, period, quota, burst); } static long tg_get_cfs_quota(struct task_group *tg) @@ -9074,15 +9898,16 @@ static long tg_get_cfs_quota(struct task_group *tg) static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) { - u64 quota, period; + u64 quota, period, burst; if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC) return -EINVAL; period = (u64)cfs_period_us * NSEC_PER_USEC; quota = tg->cfs_bandwidth.quota; + burst = tg->cfs_bandwidth.burst; - return tg_set_cfs_bandwidth(tg, period, quota); + return tg_set_cfs_bandwidth(tg, period, quota, burst); } static long tg_get_cfs_period(struct task_group *tg) @@ -9095,6 +9920,30 @@ static long tg_get_cfs_period(struct task_group *tg) return cfs_period_us; } +static int tg_set_cfs_burst(struct task_group *tg, long cfs_burst_us) +{ + u64 quota, period, burst; + + if ((u64)cfs_burst_us > U64_MAX / NSEC_PER_USEC) + return -EINVAL; + + burst = (u64)cfs_burst_us * NSEC_PER_USEC; + period = ktime_to_ns(tg->cfs_bandwidth.period); + quota = tg->cfs_bandwidth.quota; + + return tg_set_cfs_bandwidth(tg, period, quota, burst); +} + +static long tg_get_cfs_burst(struct task_group *tg) +{ + u64 burst_us; + + burst_us = tg->cfs_bandwidth.burst; + do_div(burst_us, NSEC_PER_USEC); + + return burst_us; +} + static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -9119,6 +9968,18 @@ static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, return tg_set_cfs_period(css_tg(css), cfs_period_us); } +static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return tg_get_cfs_burst(css_tg(css)); +} + +static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 cfs_burst_us) +{ + return tg_set_cfs_burst(css_tg(css), cfs_burst_us); +} + struct cfs_schedulable_data { struct task_group *tg; u64 period, quota; @@ -9271,6 +10132,11 @@ static struct cftype cpu_legacy_files[] = { .read_u64 = cpu_cfs_period_read_u64, .write_u64 = cpu_cfs_period_write_u64, }, + { + .name = "cfs_burst_us", + .read_u64 = cpu_cfs_burst_read_u64, + .write_u64 = cpu_cfs_burst_write_u64, + }, { .name = "stat", .seq_show = cpu_cfs_stat_show, @@ -9436,12 +10302,13 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, { struct task_group *tg = css_tg(of_css(of)); u64 period = tg_get_cfs_period(tg); + u64 burst = tg_get_cfs_burst(tg); u64 quota; int ret; ret = cpu_period_quota_parse(buf, &period, "a); if (!ret) - ret = tg_set_cfs_bandwidth(tg, period, quota); + ret = tg_set_cfs_bandwidth(tg, period, quota, burst); return ret ?: nbytes; } #endif @@ -9468,6 +10335,12 @@ static struct cftype cpu_files[] = { .seq_show = cpu_max_show, .write = cpu_max_write, }, + { + .name = "max.burst", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = cpu_cfs_burst_read_u64, + .write_u64 = cpu_cfs_burst_write_u64, + }, #endif #ifdef CONFIG_UCLAMP_TASK_GROUP {