static atomic_t sched_core_count;
static struct cpumask sched_core_mask;
+static void sched_core_lock(int cpu, unsigned long *flags)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+ int t, i = 0;
+
+ local_irq_save(*flags);
+ for_each_cpu(t, smt_mask)
+ raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
+}
+
+static void sched_core_unlock(int cpu, unsigned long *flags)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+ int t;
+
+ for_each_cpu(t, smt_mask)
+ raw_spin_unlock(&cpu_rq(t)->__lock);
+ local_irq_restore(*flags);
+}
+
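For readers new to this pattern, here is a minimal userspace sketch of the same discipline, as an illustration only; NR_SIBLINGS, sibling_lock, core_lock_all and core_unlock_all are invented names. Taking every sibling's lock in one fixed iteration order is what prevents ABBA deadlock when two CPUs lock the same core concurrently; the subclass index passed to raw_spin_lock_nested() only tells lockdep that nesting locks of the same class is intentional here.

#include <pthread.h>
#include <stdio.h>

#define NR_SIBLINGS 8	/* assumption: SMT8, per the removed comment below */

static pthread_mutex_t sibling_lock[NR_SIBLINGS];

static void core_lock_all(void)
{
	/* fixed ascending order prevents ABBA deadlock between callers */
	for (int t = 0; t < NR_SIBLINGS; t++)
		pthread_mutex_lock(&sibling_lock[t]);
}

static void core_unlock_all(void)
{
	for (int t = 0; t < NR_SIBLINGS; t++)
		pthread_mutex_unlock(&sibling_lock[t]);
}

int main(void)
{
	for (int t = 0; t < NR_SIBLINGS; t++)
		pthread_mutex_init(&sibling_lock[t], NULL);

	core_lock_all();
	printf("all %d sibling locks held\n", NR_SIBLINGS);
	core_unlock_all();
	return 0;
}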
static void __sched_core_flip(bool enabled)
{
- int cpu, t, i;
+ unsigned long flags;
+ int cpu, t;
cpus_read_lock();
for_each_cpu(cpu, &sched_core_mask) {
const struct cpumask *smt_mask = cpu_smt_mask(cpu);
- i = 0;
- local_irq_disable();
- for_each_cpu(t, smt_mask) {
- /* supports up to SMT8 */
- raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
- }
+ sched_core_lock(cpu, &flags);
for_each_cpu(t, smt_mask)
cpu_rq(t)->core_enabled = enabled;
- for_each_cpu(t, smt_mask)
- raw_spin_unlock(&cpu_rq(t)->__lock);
- local_irq_enable();
+ sched_core_unlock(cpu, &flags);
cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
}
	cpus_read_unlock();
}
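The loop above visits each physical core exactly once: after flipping one CPU's siblings it strips the entire SMT mask out of sched_core_mask, so later iterations skip the siblings. A self-contained sketch of that idiom, with a 64-bit word standing in for struct cpumask and an invented siblings_of() helper modeling SMT2:

#include <stdint.h>
#include <stdio.h>

/* hypothetical helper: sibling mask of @cpu, modeled as pairs (SMT2) */
static uint64_t siblings_of(int cpu)
{
	return 3ULL << (cpu & ~1);
}

int main(void)
{
	uint64_t mask = 0xffULL;	/* CPUs 0-7 "online" */

	for (int cpu = 0; cpu < 64; cpu++) {
		if (!(mask & (1ULL << cpu)))
			continue;
		printf("flip core of cpu %d (siblings %#llx)\n",
		       cpu, (unsigned long long)siblings_of(cpu));
		mask &= ~siblings_of(cpu);	/* cpumask_andnot() analogue */
	}
	return 0;
}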
+/*
+ * Invoked from try_to_wake_up() to check whether the task can be woken up.
+ *
+ * The caller holds p::pi_lock if p != current or has preemption
+ * disabled when p == current.
+ *
+ * The rules of PREEMPT_RT saved_state:
+ *
+ * The related locking code always holds p::pi_lock when updating
+ * p::saved_state, which means the code is fully serialized in both cases.
+ *
+ * The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT, with no
+ * other state bits set. This allows us to distinguish all wakeup scenarios.
+ */
+static __always_inline
+bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
+{
+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
+ WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) &&
+ state != TASK_RTLOCK_WAIT);
+ }
+
+ if (READ_ONCE(p->__state) & state) {
+ *success = 1;
+ return true;
+ }
+
+#ifdef CONFIG_PREEMPT_RT
+ /*
+ * Saved state preserves the task state across blocking on
+ * an RT lock. If the state matches, set p::saved_state to
+ * TASK_RUNNING, but do not wake the task because it waits
+ * for a lock wakeup. Also indicate success because from
+ * the regular waker's point of view this has succeeded.
+ *
+ * After acquiring the lock the task will restore p::__state
+ * from p::saved_state which ensures that the regular
+ * wakeup is not lost. The restore will also set
+ * p::saved_state to TASK_RUNNING so any further tests will
+ * not result in false positives vs. @success.
+ */
+ if (p->saved_state & state) {
+ p->saved_state = TASK_RUNNING;
+ *success = 1;
+ }
+#endif
+ return false;
+}
+
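To make the two-level match concrete, here is a standalone model of ttwu_state_match()'s decision table. Plain unsigned ints stand in for p::__state and p::saved_state, the PREEMPT_RT branch is compiled unconditionally, and only the bit-disjointness of the constants matters. A true return lets the waker perform a real wakeup; a saved_state hit reports success without waking.

#include <assert.h>

#define TASK_RUNNING		0x0000
#define TASK_INTERRUPTIBLE	0x0001
#define TASK_RTLOCK_WAIT	0x1000

struct toy_task { unsigned int state, saved_state; };

static int toy_state_match(struct toy_task *p, unsigned int state, int *success)
{
	if (p->state & state) {		/* real wakeup: caller proceeds */
		*success = 1;
		return 1;
	}
	if (p->saved_state & state) {	/* RT: record it, but don't wake */
		p->saved_state = TASK_RUNNING;
		*success = 1;
	}
	return 0;
}

int main(void)
{
	int success = 0;
	/* task blocked on an rtlock while it was TASK_INTERRUPTIBLE */
	struct toy_task p = { TASK_RTLOCK_WAIT, TASK_INTERRUPTIBLE };

	/* regular waker: no wakeup happens, but success is reported and
	 * the pending wakeup is folded into saved_state */
	assert(!toy_state_match(&p, TASK_INTERRUPTIBLE, &success));
	assert(success && p.saved_state == TASK_RUNNING);

	/* the lock wakeup itself matches __state and really wakes */
	success = 0;
	assert(toy_state_match(&p, TASK_RTLOCK_WAIT, &success));
	assert(success);
	return 0;
}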
/*
* Notes on Program-Order guarantees on SMP systems.
*
* - we're serialized against set_special_state() by virtue of
* it disabling IRQs (this allows not taking ->pi_lock).
*/
- if (!(READ_ONCE(p->__state) & state))
+ if (!ttwu_state_match(p, state, &success))
goto out;
- success = 1;
trace_sched_waking(p);
WRITE_ONCE(p->__state, TASK_RUNNING);
trace_sched_wakeup(p);
*/
raw_spin_lock_irqsave(&p->pi_lock, flags);
smp_mb__after_spinlock();
- if (!(READ_ONCE(p->__state) & state))
+ if (!ttwu_state_match(p, state, &success))
goto unlock;
trace_sched_waking(p);
- /* We're going to change ->state: */
- success = 1;
-
/*
* Ensure we load p->on_rq _after_ p->state, otherwise it would
* be possible to, falsely, observe p->on_rq == 0 and get stuck
queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance);
}
-static inline void sched_core_cpu_starting(unsigned int cpu)
+static void sched_core_cpu_starting(unsigned int cpu)
{
const struct cpumask *smt_mask = cpu_smt_mask(cpu);
- struct rq *rq, *core_rq = NULL;
- int i;
+ struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
+ unsigned long flags;
+ int t;
+
+ sched_core_lock(cpu, &flags);
- core_rq = cpu_rq(cpu)->core;
+ WARN_ON_ONCE(rq->core != rq);
- if (!core_rq) {
- for_each_cpu(i, smt_mask) {
- rq = cpu_rq(i);
- if (rq->core && rq->core == rq)
- core_rq = rq;
+ /* if we're the first, we'll be our own leader */
+ if (cpumask_weight(smt_mask) == 1)
+ goto unlock;
+
+ /* find the leader */
+ for_each_cpu(t, smt_mask) {
+ if (t == cpu)
+ continue;
+ rq = cpu_rq(t);
+ if (rq->core == rq) {
+ core_rq = rq;
+ break;
}
+ }
- if (!core_rq)
- core_rq = cpu_rq(cpu);
+ if (WARN_ON_ONCE(!core_rq)) /* whoopsie */
+ goto unlock;
- for_each_cpu(i, smt_mask) {
- rq = cpu_rq(i);
+ /* install and validate core_rq */
+ for_each_cpu(t, smt_mask) {
+ rq = cpu_rq(t);
- WARN_ON_ONCE(rq->core && rq->core != core_rq);
+ if (t == cpu)
rq->core = core_rq;
- }
+
+ WARN_ON_ONCE(rq->core != core_rq);
}
+
+unlock:
+ sched_core_unlock(cpu, &flags);
+}
+
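A compact model of the leader election above, with toy rq structs: an onlining sibling adopts an existing leader if one is present, otherwise it leads itself. All names are invented for this sketch; offline siblings are modeled with a NULL core pointer.

#include <assert.h>
#include <stddef.h>

#define NR 4	/* toy: one core with four SMT siblings */

struct toy_rq { struct toy_rq *core; };
static struct toy_rq rqs[NR];

static void toy_cpu_starting(int cpu)
{
	struct toy_rq *core_rq = NULL;

	/* find the leader: the sibling whose rq->core points to itself */
	for (int t = 0; t < NR; t++) {
		if (t == cpu || !rqs[t].core)
			continue;
		if (rqs[t].core == &rqs[t]) {
			core_rq = rqs[t].core;
			break;
		}
	}
	rqs[cpu].core = core_rq ? core_rq : &rqs[cpu];
}

int main(void)
{
	toy_cpu_starting(0);		/* first sibling leads itself */
	assert(rqs[0].core == &rqs[0]);

	toy_cpu_starting(2);		/* later siblings adopt cpu0 */
	assert(rqs[2].core == &rqs[0]);
	return 0;
}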
+static void sched_core_cpu_deactivate(unsigned int cpu)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+ struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
+ unsigned long flags;
+ int t;
+
+ sched_core_lock(cpu, &flags);
+
+ /* if we're the last man standing, nothing to do */
+ if (cpumask_weight(smt_mask) == 1) {
+ WARN_ON_ONCE(rq->core != rq);
+ goto unlock;
+ }
+
+ /* if we're not the leader, nothing to do */
+ if (rq->core != rq)
+ goto unlock;
+
+ /* find a new leader */
+ for_each_cpu(t, smt_mask) {
+ if (t == cpu)
+ continue;
+ core_rq = cpu_rq(t);
+ break;
+ }
+
+ if (WARN_ON_ONCE(!core_rq)) /* impossible */
+ goto unlock;
+
+ /* copy the shared state to the new leader */
+ core_rq->core_task_seq = rq->core_task_seq;
+ core_rq->core_pick_seq = rq->core_pick_seq;
+ core_rq->core_cookie = rq->core_cookie;
+ core_rq->core_forceidle = rq->core_forceidle;
+ core_rq->core_forceidle_seq = rq->core_forceidle_seq;
+
+ /* install new leader */
+ for_each_cpu(t, smt_mask) {
+ rq = cpu_rq(t);
+ rq->core = core_rq;
+ }
+
+unlock:
+ sched_core_unlock(cpu, &flags);
}
+
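The handoff above only matters when the departing CPU is the current leader, because the shared core-scheduling state (the sequence counters, cookie and force-idle bookkeeping) lives in the leader's rq. A toy model of the copy-and-repoint step, with one field standing in for all of the shared state and every name invented:

#include <assert.h>
#include <stddef.h>

struct toy_rq {
	struct toy_rq *core;
	unsigned long core_task_seq;	/* stands in for all shared fields */
};

static void toy_cpu_deactivate(struct toy_rq *rqs, int nr, int cpu)
{
	struct toy_rq *rq = &rqs[cpu], *core_rq = NULL;

	if (rq->core != rq)		/* not the leader: nothing to do */
		return;

	for (int t = 0; t < nr; t++) {	/* pick any surviving sibling */
		if (t != cpu) {
			core_rq = &rqs[t];
			break;
		}
	}
	if (!core_rq)			/* last man standing */
		return;

	core_rq->core_task_seq = rq->core_task_seq;	/* hand over state */
	for (int t = 0; t < nr; t++)			/* install new leader */
		rqs[t].core = core_rq;
}

int main(void)
{
	struct toy_rq rqs[2] = {
		{ &rqs[0], 42 },	/* cpu0 leads and owns the state */
		{ &rqs[0], 0 },
	};

	toy_cpu_deactivate(rqs, 2, 0);	/* cpu0 goes offline */
	assert(rqs[1].core == &rqs[1] && rqs[0].core == &rqs[1]);
	assert(rqs[1].core_task_seq == 42);
	return 0;
}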
+static inline void sched_core_cpu_dying(unsigned int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (rq->core != rq)
+ rq->core = rq;
+}
+
#else /* !CONFIG_SCHED_CORE */
static inline void sched_core_cpu_starting(unsigned int cpu) {}
+static inline void sched_core_cpu_deactivate(unsigned int cpu) {}
+static inline void sched_core_cpu_dying(unsigned int cpu) {}
static struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
#endif /* CONFIG_SCHED_CORE */
+/*
+ * Constants for the sched_mode argument of __schedule().
+ *
+ * The mode argument allows RT enabled kernels to differentiate a
+ * preemption from blocking on an 'sleeping' spin/rwlock. Note that
+ * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to
+ * optimize the AND operation out and just check for zero.
+ */
+#define SM_NONE 0x0
+#define SM_PREEMPT 0x1
+#define SM_RTLOCK_WAIT 0x2
+
+#ifndef CONFIG_PREEMPT_RT
+# define SM_MASK_PREEMPT (~0U)
+#else
+# define SM_MASK_PREEMPT SM_PREEMPT
+#endif
+
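The compiler remark in the comment can be checked in two lines: with SM_MASK_PREEMPT == ~0U the test !(sched_mode & SM_MASK_PREEMPT) in __schedule() reduces to sched_mode == 0, so on !RT only SM_NONE takes the voluntary-block path. On RT, SM_RTLOCK_WAIT must also take that path, since blocking on an rtlock is a genuine block, while a preemption still must not. A quick assertion sketch:

#include <assert.h>

#define SM_NONE		0x0
#define SM_PREEMPT	0x1
#define SM_RTLOCK_WAIT	0x2

int main(void)
{
	unsigned int mask_not_rt = ~0U;		/* SM_MASK_PREEMPT, !PREEMPT_RT */
	unsigned int mask_rt = SM_PREEMPT;	/* SM_MASK_PREEMPT, PREEMPT_RT */

	/* !RT: any nonzero mode counts as preemption; the AND is free */
	assert(!(SM_NONE & mask_not_rt));
	assert(SM_PREEMPT & mask_not_rt);

	/* RT: an rtlock wait deactivates the task like a regular block... */
	assert(!(SM_RTLOCK_WAIT & mask_rt));
	/* ...while a preemption still skips the block path */
	assert(SM_PREEMPT & mask_rt);
	return 0;
}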
/*
* __schedule() is the main scheduler function.
*
*
* WARNING: must be called with preemption disabled!
*/
-static void __sched notrace __schedule(bool preempt)
+static void __sched notrace __schedule(unsigned int sched_mode)
{
struct task_struct *prev, *next;
unsigned long *switch_count;
rq = cpu_rq(cpu);
prev = rq->curr;
- schedule_debug(prev, preempt);
+ schedule_debug(prev, !!sched_mode);
if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
hrtick_clear(rq);
local_irq_disable();
- rcu_note_context_switch(preempt);
+ rcu_note_context_switch(!!sched_mode);
/*
* Make sure that signal_pending_state()->signal_pending() below
* - ptrace_{,un}freeze_traced() can change ->state underneath us.
*/
prev_state = READ_ONCE(prev->__state);
- if (!preempt && prev_state) {
+ if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
if (signal_pending_state(prev_state, prev)) {
WRITE_ONCE(prev->__state, TASK_RUNNING);
} else {
migrate_disable_switch(rq, prev);
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
- trace_sched_switch(preempt, prev, next);
+ trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next);
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
/* Tell freezer to ignore us: */
current->flags |= PF_NOFREEZE;
- __schedule(false);
+ __schedule(SM_NONE);
BUG();
/* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */
sched_submit_work(tsk);
do {
preempt_disable();
- __schedule(false);
+ __schedule(SM_NONE);
sched_preempt_enable_no_resched();
} while (need_resched());
sched_update_worker(tsk);
*/
WARN_ON_ONCE(current->__state);
do {
- __schedule(false);
+ __schedule(SM_NONE);
} while (need_resched());
}
preempt_disable();
}
+#ifdef CONFIG_PREEMPT_RT
+void __sched notrace schedule_rtlock(void)
+{
+ do {
+ preempt_disable();
+ __schedule(SM_RTLOCK_WAIT);
+ sched_preempt_enable_no_resched();
+ } while (need_resched());
+}
+NOKPROBE_SYMBOL(schedule_rtlock);
+#endif
+
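For context, the intended caller of schedule_rtlock() is the RT lock slowpath added elsewhere in this series. Schematically, and as a from-memory sketch rather than code from this patch, the blocked task saves its state via current_save_and_set_rtlock_wait_state(), loops on acquisition attempts in TASK_RTLOCK_WAIT, and restores the saved state afterwards, which is exactly the situation ttwu_state_match() above handles:

	current_save_and_set_rtlock_wait_state();
	for (;;) {
		if (try_to_take_rt_mutex(lock, current, &waiter))
			break;
		raw_spin_unlock_irq(&lock->wait_lock);
		schedule_rtlock();
		raw_spin_lock_irq(&lock->wait_lock);
		set_current_state(TASK_RTLOCK_WAIT);
	}
	current_restore_rtlock_saved_state();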
static void __sched notrace preempt_schedule_common(void)
{
do {
*/
preempt_disable_notrace();
preempt_latency_start(1);
- __schedule(true);
+ __schedule(SM_PREEMPT);
preempt_latency_stop(1);
preempt_enable_no_resched_notrace();
* an infinite recursion.
*/
prev_ctx = exception_enter();
- __schedule(true);
+ __schedule(SM_PREEMPT);
exception_exit(prev_ctx);
preempt_latency_stop(1);
do {
preempt_disable();
local_irq_enable();
- __schedule(true);
+ __schedule(SM_PREEMPT);
local_irq_disable();
sched_preempt_enable_no_resched();
} while (need_resched());
preempt_schedule_common();
return 1;
}
+ /*
+ * In preemptible kernels, ->rcu_read_lock_nesting tells the tick
+ * whether the current CPU is in an RCU read-side critical section,
+ * so the tick can report quiescent states even for CPUs looping
+ * in kernel context. In contrast, in non-preemptible kernels,
+ * RCU readers leave no in-memory hints, which means that CPU-bound
+ * processes executing in kernel context might never report an
+ * RCU quiescent state. Therefore, the following code causes
+ * cond_resched() to report a quiescent state, but only when RCU
+ * is in urgent need of one.
+ */
#ifndef CONFIG_PREEMPT_RCU
rcu_all_qs();
#endif
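As a usage note for the paragraph above: a CPU-bound loop in kernel context on a non-preemptible kernel should call cond_resched() so that this rcu_all_qs() hook gets a chance to run. A kernel-style fragment for illustration; every identifier except cond_resched() is invented:

#include <linux/sched.h>

static void crunch(unsigned long *data, unsigned long nr)
{
	unsigned long i;

	for (i = 0; i < nr; i++) {
		data[i] *= 2654435761UL;	/* CPU-bound work */
		/*
		 * Without this, a !CONFIG_PREEMPT kernel leaves RCU no
		 * hint that this CPU is outside a read-side critical
		 * section, and grace periods can stall.
		 */
		cond_resched();
	}
}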
*/
if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
static_branch_dec_cpuslocked(&sched_smt_present);
+
+ sched_core_cpu_deactivate(cpu);
#endif
if (!sched_smp_initialized)
calc_load_migrate(rq);
update_max_interval();
hrtick_clear(rq);
+ sched_core_cpu_dying(cpu);
return 0;
}
#endif
atomic_set(&rq->nr_iowait, 0);
#ifdef CONFIG_SCHED_CORE
- rq->core = NULL;
+ rq->core = rq;
rq->core_pick = NULL;
rq->core_enabled = 0;
rq->core_tree = RB_ROOT;