*/
static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
{
+ /* When not in the task's cpumask, no point in looking further. */
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
return false;
- if (is_per_cpu_kthread(p) || is_migration_disabled(p))
+ /* migrate_disabled() must be allowed to finish. */
+ if (is_migration_disabled(p))
return cpu_online(cpu);
- return cpu_active(cpu);
+ /* Non kernel threads are not allowed during either online or offline. */
+ if (!(p->flags & PF_KTHREAD))
+ return cpu_active(cpu);
+
+ /* KTHREAD_IS_PER_CPU is always allowed. */
+ if (kthread_is_per_cpu(p))
+ return cpu_online(cpu);
+
+ /* Regular kernel threads don't get to stay during offline. */
+ if (cpu_rq(cpu)->balance_push)
+ return false;
+
+ /* But are allowed during online. */
+ return cpu_online(cpu);
}
/*
if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
/*
- * Kernel threads are allowed on online && !active CPUs.
+ * Kernel threads are allowed on online && !active CPUs,
+ * however, during cpu-hot-unplug, even these might get pushed
+ * away if not KTHREAD_IS_PER_CPU.
*
* Specifically, migration_disabled() tasks must not fail the
* cpumask_any_and_distribute() pick below, esp. so on
__do_set_cpus_allowed(p, new_mask, flags);
- if (p->flags & PF_KTHREAD) {
- /*
- * For kernel threads that do indeed end up on online &&
- * !active we want to ensure they are strict per-CPU threads.
- */
- WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
- !cpumask_intersects(new_mask, cpu_active_mask) &&
- p->nr_cpus_allowed != 1);
- }
-
return affine_move_task(rq, p, &rf, dest_cpu, flags);
out:
static inline bool ttwu_queue_cond(int cpu, int wake_flags)
{
+ /*
+ * Do not complicate things with the async wake_list while the CPU is
+ * in hotplug state.
+ */
+ if (!cpu_active(cpu))
+ return false;
+
/*
* If the CPU does not share cache, then queue the task on the
* remote rqs wakelist to avoid accessing remote data.
return cpu_rq(cpu)->idle;
}
+#ifdef CONFIG_SMP
+/*
+ * This function computes an effective utilization for the given CPU, to be
+ * used for frequency selection given the linear relation: f = u * f_max.
+ *
+ * The scheduler tracks the following metrics:
+ *
+ * cpu_util_{cfs,rt,dl,irq}()
+ * cpu_bw_dl()
+ *
+ * Where the cfs,rt and dl util numbers are tracked with the same metric and
+ * synchronized windows and are thus directly comparable.
+ *
+ * The cfs,rt,dl utilization are the running times measured with rq->clock_task
+ * which excludes things like IRQ and steal-time. These latter are then accrued
+ * in the irq utilization.
+ *
+ * The DL bandwidth number otoh is not a measured metric but a value computed
+ * based on the task model parameters and gives the minimal utilization
+ * required to meet deadlines.
+ */
+unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
+ unsigned long max, enum cpu_util_type type,
+ struct task_struct *p)
+{
+ unsigned long dl_util, util, irq;
+ struct rq *rq = cpu_rq(cpu);
+
+ if (!uclamp_is_used() &&
+ type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
+ return max;
+ }
+
+ /*
+ * Early check to see if IRQ/steal time saturates the CPU, can be
+ * because of inaccuracies in how we track these -- see
+ * update_irq_load_avg().
+ */
+ irq = cpu_util_irq(rq);
+ if (unlikely(irq >= max))
+ return max;
+
+ /*
+ * Because the time spend on RT/DL tasks is visible as 'lost' time to
+ * CFS tasks and we use the same metric to track the effective
+ * utilization (PELT windows are synchronized) we can directly add them
+ * to obtain the CPU's actual utilization.
+ *
+ * CFS and RT utilization can be boosted or capped, depending on
+ * utilization clamp constraints requested by currently RUNNABLE
+ * tasks.
+ * When there are no CFS RUNNABLE tasks, clamps are released and
+ * frequency will be gracefully reduced with the utilization decay.
+ */
+ util = util_cfs + cpu_util_rt(rq);
+ if (type == FREQUENCY_UTIL)
+ util = uclamp_rq_util_with(rq, util, p);
+
+ dl_util = cpu_util_dl(rq);
+
+ /*
+ * For frequency selection we do not make cpu_util_dl() a permanent part
+ * of this sum because we want to use cpu_bw_dl() later on, but we need
+ * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
+ * that we select f_max when there is no idle time.
+ *
+ * NOTE: numerical errors or stop class might cause us to not quite hit
+ * saturation when we should -- something for later.
+ */
+ if (util + dl_util >= max)
+ return max;
+
+ /*
+ * OTOH, for energy computation we need the estimated running time, so
+ * include util_dl and ignore dl_bw.
+ */
+ if (type == ENERGY_UTIL)
+ util += dl_util;
+
+ /*
+ * There is still idle time; further improve the number by using the
+ * irq metric. Because IRQ/steal time is hidden from the task clock we
+ * need to scale the task numbers:
+ *
+ * max - irq
+ * U' = irq + --------- * U
+ * max
+ */
+ util = scale_irq_capacity(util, irq, max);
+ util += irq;
+
+ /*
+ * Bandwidth required by DEADLINE must always be granted while, for
+ * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
+ * to gracefully reduce the frequency when no tasks show up for longer
+ * periods of time.
+ *
+ * Ideally we would like to set bw_dl as min/guaranteed freq and util +
+ * bw_dl as requested freq. However, cpufreq is not yet ready for such
+ * an interface. So, we only do the latter for now.
+ */
+ if (type == FREQUENCY_UTIL)
+ util += cpu_bw_dl(rq);
+
+ return min(max, util);
+}
+
+unsigned long sched_cpu_util(int cpu, unsigned long max)
+{
+ return effective_cpu_util(cpu, cpu_util_cfs(cpu_rq(cpu)), max,
+ ENERGY_UTIL, NULL);
+}
+#endif /* CONFIG_SMP */
+
/**
* find_process_by_pid - find a process with a matching PID value.
* @pid: the pid in question.
/*
* Both the cpu-hotplug and stop task are in this case and are
* required to complete the hotplug process.
+ *
+ * XXX: the idle task does not match kthread_is_per_cpu() due to
+ * histerical raisins.
*/
- if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) {
+ if (rq->idle == push_task ||
+ ((push_task->flags & PF_KTHREAD) && kthread_is_per_cpu(push_task)) ||
+ is_migration_disabled(push_task)) {
+
/*
* If this is the idle task on the outgoing CPU try to wake
* up the hotplug control thread which might wait for the
/*
* At this point need_resched() is true and we'll take the loop in
* schedule(). The next pick is obviously going to be the stop task
- * which is_per_cpu_kthread() and will push this task away.
+ * which kthread_is_per_cpu() and will push this task away.
*/
raw_spin_lock(&rq->lock);
}
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
- if (on)
+ rq->balance_push = on;
+ if (on) {
+ WARN_ON_ONCE(rq->balance_callback);
rq->balance_callback = &balance_push_callback;
- else
+ } else if (rq->balance_callback == &balance_push_callback) {
rq->balance_callback = NULL;
+ }
rq_unlock_irqrestore(rq, &rf);
}
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
+ /*
+ * Make sure that when the hotplug state machine does a roll-back
+ * we clear balance_push. Ideally that would happen earlier...
+ */
balance_push_set(cpu, false);
#ifdef CONFIG_SCHED_SMT
struct rq_flags rf;
int ret;
+ /*
+ * Remove CPU from nohz.idle_cpus_mask to prevent participating in
+ * load balancing when not active
+ */
+ nohz_balance_exit_idle(rq);
+
set_cpu_active(cpu, false);
+
+ /*
+ * From this point forward, this CPU will refuse to run any task that
+ * is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively
+ * push those tasks away until this gets cleared, see
+ * sched_cpu_dying().
+ */
+ balance_push_set(cpu, true);
+
/*
- * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
- * users of this state to go away such that all new such users will
- * observe it.
+ * We've cleared cpu_active_mask / set balance_push, wait for all
+ * preempt-disabled and RCU users of this state to go away such that
+ * all new such users will observe it.
+ *
+ * Specifically, we rely on ttwu to no longer target this CPU, see
+ * ttwu_queue_cond() and is_cpu_allowed().
*
* Do sync before park smpboot threads to take care the rcu boost case.
*/
synchronize_rcu();
- balance_push_set(cpu, true);
-
rq_lock_irqsave(rq, &rf);
if (rq->rd) {
update_rq_clock(rq);
atomic_long_add(delta, &calc_load_tasks);
}
+ static void dump_rq_tasks(struct rq *rq, const char *loglvl)
+ {
+ struct task_struct *g, *p;
+ int cpu = cpu_of(rq);
+
+ lockdep_assert_held(&rq->lock);
+
+ printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running);
+ for_each_process_thread(g, p) {
+ if (task_cpu(p) != cpu)
+ continue;
+
+ if (!task_on_rq_queued(p))
+ continue;
+
+ printk("%s\tpid: %d, name: %s\n", loglvl, p->pid, p->comm);
+ }
+ }
+
int sched_cpu_dying(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
sched_tick_stop(cpu);
rq_lock_irqsave(rq, &rf);
- BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
+ if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) {
+ WARN(true, "Dying CPU not properly vacated!");
+ dump_rq_tasks(rq, KERN_WARNING);
+ }
rq_unlock_irqrestore(rq, &rf);
+ /*
+ * Now that the CPU is offline, make sure we're welcome
+ * to new tasks once we come back up.
+ */
+ balance_push_set(cpu, false);
+
calc_load_migrate(rq);
update_max_interval();
- nohz_balance_exit_idle(rq);
hrtick_clear(rq);
return 0;
}