1 // SPDX-License-Identifier: GPL-2.0-only
5 * Core kernel scheduler code and related syscalls
7 * Copyright (C) 1991-2002 Linus Torvalds
9 #define CREATE_TRACE_POINTS
10 #include <trace/events/sched.h>
11 #undef CREATE_TRACE_POINTS
15 #include <linux/nospec.h>
17 #include <linux/kcov.h>
18 #include <linux/scs.h>
20 #include <asm/switch_to.h>
23 #include "../workqueue_internal.h"
24 #include "../../fs/io-wq.h"
25 #include "../smpboot.h"
31 * Export tracepoints that act as a bare tracehook (ie: have no trace event
32 * associated with them) to allow external modules to probe them.
34 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
35 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
36 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
37 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
38 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
39 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
40 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
41 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
42 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
43 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
45 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
47 #ifdef CONFIG_SCHED_DEBUG
49 * Debugging: various feature bits
51 * If SCHED_DEBUG is disabled, each compilation unit has its own copy of
52 * sysctl_sched_features, defined in sched.h, to allow constants propagation
53 * at compile time and compiler optimization based on features default.
55 #define SCHED_FEAT(name, enabled) \
56 (1UL << __SCHED_FEAT_##name) * enabled |
57 const_debug unsigned int sysctl_sched_features =
63 * Print a warning if need_resched is set for the given duration (if
64 * LATENCY_WARN is enabled).
66 * If sysctl_resched_latency_warn_once is set, only one warning will be shown
69 __read_mostly int sysctl_resched_latency_warn_ms = 100;
70 __read_mostly int sysctl_resched_latency_warn_once = 1;
71 #endif /* CONFIG_SCHED_DEBUG */
74 * Number of tasks to iterate in a single balance run.
75 * Limited because this is done with IRQs disabled.
77 const_debug unsigned int sysctl_sched_nr_migrate = 32;
80 * period over which we measure -rt task CPU usage in us.
83 unsigned int sysctl_sched_rt_period = 1000000;
85 __read_mostly int scheduler_running;
87 #ifdef CONFIG_SCHED_CORE
89 DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
91 /* kernel prio, less is more */
92 static inline int __task_prio(struct task_struct *p)
94 if (p->sched_class == &stop_sched_class) /* trumps deadline */
97 if (rt_prio(p->prio)) /* includes deadline */
98 return p->prio; /* [-1, 99] */
100 if (p->sched_class == &idle_sched_class)
101 return MAX_RT_PRIO + NICE_WIDTH; /* 140 */
103 return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */
113 /* real prio, less is less */
114 static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
117 int pa = __task_prio(a), pb = __task_prio(b);
125 if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
126 return !dl_time_before(a->dl.deadline, b->dl.deadline);
128 if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
129 return cfs_prio_less(a, b, in_fi);
134 static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b)
136 if (a->core_cookie < b->core_cookie)
139 if (a->core_cookie > b->core_cookie)
142 /* flip prio, so high prio is leftmost */
143 if (prio_less(b, a, task_rq(a)->core->core_forceidle))
149 #define __node_2_sc(node) rb_entry((node), struct task_struct, core_node)
151 static inline bool rb_sched_core_less(struct rb_node *a, const struct rb_node *b)
153 return __sched_core_less(__node_2_sc(a), __node_2_sc(b));
156 static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node)
158 const struct task_struct *p = __node_2_sc(node);
159 unsigned long cookie = (unsigned long)key;
161 if (cookie < p->core_cookie)
164 if (cookie > p->core_cookie)
170 void sched_core_enqueue(struct rq *rq, struct task_struct *p)
172 rq->core->core_task_seq++;
177 rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
180 void sched_core_dequeue(struct rq *rq, struct task_struct *p)
182 rq->core->core_task_seq++;
184 if (!sched_core_enqueued(p))
187 rb_erase(&p->core_node, &rq->core_tree);
188 RB_CLEAR_NODE(&p->core_node);
192 * Find left-most (aka, highest priority) task matching @cookie.
194 static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
196 struct rb_node *node;
198 node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp);
200 * The idle task always matches any cookie!
203 return idle_sched_class.pick_task(rq);
205 return __node_2_sc(node);
208 static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie)
210 struct rb_node *node = &p->core_node;
212 node = rb_next(node);
216 p = container_of(node, struct task_struct, core_node);
217 if (p->core_cookie != cookie)
224 * Magic required such that:
226 * raw_spin_rq_lock(rq);
228 * raw_spin_rq_unlock(rq);
230 * ends up locking and unlocking the _same_ lock, and all CPUs
231 * always agree on what rq has what lock.
233 * XXX entirely possible to selectively enable cores, don't bother for now.
236 static DEFINE_MUTEX(sched_core_mutex);
237 static atomic_t sched_core_count;
238 static struct cpumask sched_core_mask;
240 static void sched_core_lock(int cpu, unsigned long *flags)
242 const struct cpumask *smt_mask = cpu_smt_mask(cpu);
245 local_irq_save(*flags);
246 for_each_cpu(t, smt_mask)
247 raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
250 static void sched_core_unlock(int cpu, unsigned long *flags)
252 const struct cpumask *smt_mask = cpu_smt_mask(cpu);
255 for_each_cpu(t, smt_mask)
256 raw_spin_unlock(&cpu_rq(t)->__lock);
257 local_irq_restore(*flags);
260 static void __sched_core_flip(bool enabled)
268 * Toggle the online cores, one by one.
270 cpumask_copy(&sched_core_mask, cpu_online_mask);
271 for_each_cpu(cpu, &sched_core_mask) {
272 const struct cpumask *smt_mask = cpu_smt_mask(cpu);
274 sched_core_lock(cpu, &flags);
276 for_each_cpu(t, smt_mask)
277 cpu_rq(t)->core_enabled = enabled;
279 sched_core_unlock(cpu, &flags);
281 cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
285 * Toggle the offline CPUs.
287 cpumask_copy(&sched_core_mask, cpu_possible_mask);
288 cpumask_andnot(&sched_core_mask, &sched_core_mask, cpu_online_mask);
290 for_each_cpu(cpu, &sched_core_mask)
291 cpu_rq(cpu)->core_enabled = enabled;
296 static void sched_core_assert_empty(void)
300 for_each_possible_cpu(cpu)
301 WARN_ON_ONCE(!RB_EMPTY_ROOT(&cpu_rq(cpu)->core_tree));
304 static void __sched_core_enable(void)
306 static_branch_enable(&__sched_core_enabled);
308 * Ensure all previous instances of raw_spin_rq_*lock() have finished
309 * and future ones will observe !sched_core_disabled().
312 __sched_core_flip(true);
313 sched_core_assert_empty();
316 static void __sched_core_disable(void)
318 sched_core_assert_empty();
319 __sched_core_flip(false);
320 static_branch_disable(&__sched_core_enabled);
323 void sched_core_get(void)
325 if (atomic_inc_not_zero(&sched_core_count))
328 mutex_lock(&sched_core_mutex);
329 if (!atomic_read(&sched_core_count))
330 __sched_core_enable();
332 smp_mb__before_atomic();
333 atomic_inc(&sched_core_count);
334 mutex_unlock(&sched_core_mutex);
337 static void __sched_core_put(struct work_struct *work)
339 if (atomic_dec_and_mutex_lock(&sched_core_count, &sched_core_mutex)) {
340 __sched_core_disable();
341 mutex_unlock(&sched_core_mutex);
345 void sched_core_put(void)
347 static DECLARE_WORK(_work, __sched_core_put);
350 * "There can be only one"
352 * Either this is the last one, or we don't actually need to do any
353 * 'work'. If it is the last *again*, we rely on
354 * WORK_STRUCT_PENDING_BIT.
356 if (!atomic_add_unless(&sched_core_count, -1, 1))
357 schedule_work(&_work);
360 #else /* !CONFIG_SCHED_CORE */
/* No-op stand-in used when CONFIG_SCHED_CORE is not set; the body is empty. */
362 static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
/* No-op stand-in used when CONFIG_SCHED_CORE is not set; the body is empty. */
363 static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
365 #endif /* CONFIG_SCHED_CORE */
368 * part of the period that we allow rt tasks to run in us.
371 int sysctl_sched_rt_runtime = 950000;
375 * Serialization rules:
381 * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
384 * rq2->lock where: rq1 < rq2
388 * Normal scheduling state is serialized by rq->lock. __schedule() takes the
389 * local CPU's rq->lock, it optionally removes the task from the runqueue and
390 * always looks at the local rq data structures to find the most eligible task
393 * Task enqueue is also under rq->lock, possibly taken from another CPU.
394 * Wakeups from another LLC domain might use an IPI to transfer the enqueue to
395 * the local CPU to avoid bouncing the runqueue state around [ see
396 * ttwu_queue_wakelist() ]
398 * Task wakeup, specifically wakeups that involve migration, are horribly
399 * complicated to avoid having to take two rq->locks.
403 * System-calls and anything external will use task_rq_lock() which acquires
404 * both p->pi_lock and rq->lock. As a consequence the state they change is
405 * stable while holding either lock:
407 * - sched_setaffinity()/
408 * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed
409 * - set_user_nice(): p->se.load, p->*prio
410 * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio,
411 * p->se.load, p->rt_priority,
412 * p->dl.dl_{runtime, deadline, period, flags, bw, density}
413 * - sched_setnuma(): p->numa_preferred_nid
414 * - sched_move_task()/
415 * cpu_cgroup_fork(): p->sched_task_group
416 * - uclamp_update_active() p->uclamp*
418 * p->state <- TASK_*:
420 * is changed locklessly using set_current_state(), __set_current_state() or
421 * set_special_state(), see their respective comments, or by
422 * try_to_wake_up(). This latter uses p->pi_lock to serialize against
425 * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
427 * is set by activate_task() and cleared by deactivate_task(), under
428 * rq->lock. Non-zero indicates the task is runnable, the special
429 * ON_RQ_MIGRATING state is used for migration without holding both
430 * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
432 * p->on_cpu <- { 0, 1 }:
434 * is set by prepare_task() and cleared by finish_task() such that it will be
435 * set before p is scheduled-in and cleared after p is scheduled-out, both
436 * under rq->lock. Non-zero indicates the task is running on its CPU.
438 * [ The astute reader will observe that it is possible for two tasks on one
439 * CPU to have ->on_cpu = 1 at the same time. ]
441 * task_cpu(p): is changed by set_task_cpu(), the rules are:
443 * - Don't call set_task_cpu() on a blocked task:
445 * We don't care what CPU we're not running on, this simplifies hotplug,
446 * the CPU assignment of blocked tasks isn't required to be valid.
448 * - for try_to_wake_up(), called under p->pi_lock:
450 * This allows try_to_wake_up() to only take one rq->lock, see its comment.
452 * - for migration called under rq->lock:
453 * [ see task_on_rq_migrating() in task_rq_lock() ]
455 * o move_queued_task()
458 * - for migration called under double_rq_lock():
460 * o __migrate_swap_task()
461 * o push_rt_task() / pull_rt_task()
462 * o push_dl_task() / pull_dl_task()
463 * o dl_task_offline_migration()
467 void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
469 raw_spinlock_t *lock;
471 /* Matches synchronize_rcu() in __sched_core_enable() */
473 if (sched_core_disabled()) {
474 raw_spin_lock_nested(&rq->__lock, subclass);
475 /* preempt_count *MUST* be > 1 */
476 preempt_enable_no_resched();
481 lock = __rq_lockp(rq);
482 raw_spin_lock_nested(lock, subclass);
483 if (likely(lock == __rq_lockp(rq))) {
484 /* preempt_count *MUST* be > 1 */
485 preempt_enable_no_resched();
488 raw_spin_unlock(lock);
492 bool raw_spin_rq_trylock(struct rq *rq)
494 raw_spinlock_t *lock;
497 /* Matches synchronize_rcu() in __sched_core_enable() */
499 if (sched_core_disabled()) {
500 ret = raw_spin_trylock(&rq->__lock);
506 lock = __rq_lockp(rq);
507 ret = raw_spin_trylock(lock);
508 if (!ret || (likely(lock == __rq_lockp(rq)))) {
512 raw_spin_unlock(lock);
516 void raw_spin_rq_unlock(struct rq *rq)
518 raw_spin_unlock(rq_lockp(rq));
523 * double_rq_lock - safely lock two runqueues
525 void double_rq_lock(struct rq *rq1, struct rq *rq2)
527 lockdep_assert_irqs_disabled();
529 if (rq_order_less(rq2, rq1))
532 raw_spin_rq_lock(rq1);
533 if (__rq_lockp(rq1) == __rq_lockp(rq2))
536 raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
541 * __task_rq_lock - lock the rq @p resides on.
543 struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
548 lockdep_assert_held(&p->pi_lock);
552 raw_spin_rq_lock(rq);
553 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
557 raw_spin_rq_unlock(rq);
559 while (unlikely(task_on_rq_migrating(p)))
565 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
567 struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
568 __acquires(p->pi_lock)
574 raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
576 raw_spin_rq_lock(rq);
578 * move_queued_task() task_rq_lock()
581 * [S] ->on_rq = MIGRATING [L] rq = task_rq()
582 * WMB (__set_task_cpu()) ACQUIRE (rq->lock);
583 * [S] ->cpu = new_cpu [L] task_rq()
587 * If we observe the old CPU in task_rq_lock(), the acquire of
588 * the old rq->lock will fully serialize against the stores.
590 * If we observe the new CPU in task_rq_lock(), the address
591 * dependency headed by '[L] rq = task_rq()' and the acquire
592 * will pair with the WMB to ensure we then also see migrating.
594 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
598 raw_spin_rq_unlock(rq);
599 raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
601 while (unlikely(task_on_rq_migrating(p)))
607 * RQ-clock updating methods:
610 static void update_rq_clock_task(struct rq *rq, s64 delta)
613 * In theory, the compiler should just see 0 here, and optimize out the call
614 * to sched_rt_avg_update. But I don't trust it...
616 s64 __maybe_unused steal = 0, irq_delta = 0;
618 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
619 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
622 * Since irq_time is only updated on {soft,}irq_exit, we might run into
623 * this case when a previous update_rq_clock() happened inside a
626 * When this happens, we stop ->clock_task and only update the
627 * prev_irq_time stamp to account for the part that fit, so that a next
628 * update will consume the rest. This ensures ->clock_task is
631 * It does however cause some slight misattribution of {soft,}irq
632 * time, a more accurate solution would be to update the irq_time using
633 * the current rq->clock timestamp, except that would require using
636 if (irq_delta > delta)
639 rq->prev_irq_time += irq_delta;
642 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
643 if (static_key_false((&paravirt_steal_rq_enabled))) {
644 steal = paravirt_steal_clock(cpu_of(rq));
645 steal -= rq->prev_steal_time_rq;
647 if (unlikely(steal > delta))
650 rq->prev_steal_time_rq += steal;
655 rq->clock_task += delta;
657 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
658 if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
659 update_irq_load_avg(rq, irq_delta + steal);
661 update_rq_clock_pelt(rq, delta);
664 void update_rq_clock(struct rq *rq)
668 lockdep_assert_rq_held(rq);
670 if (rq->clock_update_flags & RQCF_ACT_SKIP)
673 #ifdef CONFIG_SCHED_DEBUG
674 if (sched_feat(WARN_DOUBLE_CLOCK))
675 SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
676 rq->clock_update_flags |= RQCF_UPDATED;
679 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
683 update_rq_clock_task(rq, delta);
686 #ifdef CONFIG_SCHED_HRTICK
688 * Use HR-timers to deliver accurate preemption points.
691 static void hrtick_clear(struct rq *rq)
693 if (hrtimer_active(&rq->hrtick_timer))
694 hrtimer_cancel(&rq->hrtick_timer);
698 * High-resolution timer tick.
699 * Runs from hardirq context with interrupts disabled.
701 static enum hrtimer_restart hrtick(struct hrtimer *timer)
703 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
706 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
710 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
713 return HRTIMER_NORESTART;
718 static void __hrtick_restart(struct rq *rq)
720 struct hrtimer *timer = &rq->hrtick_timer;
721 ktime_t time = rq->hrtick_time;
723 hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
727 * called from hardirq (IPI) context
729 static void __hrtick_start(void *arg)
735 __hrtick_restart(rq);
740 * Called to set the hrtick timer state.
742 * called with rq->lock held and irqs disabled
744 void hrtick_start(struct rq *rq, u64 delay)
746 struct hrtimer *timer = &rq->hrtick_timer;
750 * Don't schedule slices shorter than 10000ns, that just
751 * doesn't make sense and can cause timer DoS.
753 delta = max_t(s64, delay, 10000LL);
754 rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
757 __hrtick_restart(rq);
759 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
764 * Called to set the hrtick timer state.
766 * called with rq->lock held and irqs disabled
768 void hrtick_start(struct rq *rq, u64 delay)
771 * Don't schedule slices shorter than 10000ns, that just
772 * doesn't make sense. Rely on vruntime for fairness.
774 delay = max_t(u64, delay, 10000LL);
775 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
776 HRTIMER_MODE_REL_PINNED_HARD);
779 #endif /* CONFIG_SMP */
781 static void hrtick_rq_init(struct rq *rq)
784 INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
786 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
787 rq->hrtick_timer.function = hrtick;
789 #else /* CONFIG_SCHED_HRTICK */
790 static inline void hrtick_clear(struct rq *rq)
794 static inline void hrtick_rq_init(struct rq *rq)
797 #endif /* CONFIG_SCHED_HRTICK */
800 * cmpxchg based fetch_or, macro so it works for different integer types
802 #define fetch_or(ptr, mask) \
804 typeof(ptr) _ptr = (ptr); \
805 typeof(mask) _mask = (mask); \
806 typeof(*_ptr) _old, _val = *_ptr; \
809 _old = cmpxchg(_ptr, _val, _val | _mask); \
817 #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
819 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
820 * this avoids any races wrt polling state changes and thereby avoids
823 static bool set_nr_and_not_polling(struct task_struct *p)
825 struct thread_info *ti = task_thread_info(p);
826 return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
830 * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
832 * If this returns true, then the idle task promises to call
833 * sched_ttwu_pending() and reschedule soon.
835 static bool set_nr_if_polling(struct task_struct *p)
837 struct thread_info *ti = task_thread_info(p);
838 typeof(ti->flags) old, val = READ_ONCE(ti->flags);
841 if (!(val & _TIF_POLLING_NRFLAG))
843 if (val & _TIF_NEED_RESCHED)
845 old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
854 static bool set_nr_and_not_polling(struct task_struct *p)
856 set_tsk_need_resched(p);
861 static bool set_nr_if_polling(struct task_struct *p)
868 static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
870 struct wake_q_node *node = &task->wake_q;
873 * Atomically grab the task, if ->wake_q is !nil already it means
874 * it's already queued (either by us or someone else) and will get the
875 * wakeup due to that.
877 * In order to ensure that a pending wakeup will observe our pending
878 * state, even in the failed case, an explicit smp_mb() must be used.
880 smp_mb__before_atomic();
881 if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
885 * The head is context local, there can be no concurrency.
888 head->lastp = &node->next;
893 * wake_q_add() - queue a wakeup for 'later' waking.
894 * @head: the wake_q_head to add @task to
895 * @task: the task to queue for 'later' wakeup
897 * Queue a task for later wakeup, most likely by the wake_up_q() call in the
898 * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
901 * This function must be used as-if it were wake_up_process(); IOW the task
902 * must be ready to be woken at this location.
904 void wake_q_add(struct wake_q_head *head, struct task_struct *task)
906 if (__wake_q_add(head, task))
907 get_task_struct(task);
911 * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
912 * @head: the wake_q_head to add @task to
913 * @task: the task to queue for 'later' wakeup
915 * Queue a task for later wakeup, most likely by the wake_up_q() call in the
916 * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
919 * This function must be used as-if it were wake_up_process(); IOW the task
920 * must be ready to be woken at this location.
922 * This function is essentially a task-safe equivalent to wake_q_add(). Callers
923 * that already hold reference to @task can call the 'safe' version and trust
924 * wake_q to do the right thing depending whether or not the @task is already
927 void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
929 if (!__wake_q_add(head, task))
930 put_task_struct(task);
933 void wake_up_q(struct wake_q_head *head)
935 struct wake_q_node *node = head->first;
937 while (node != WAKE_Q_TAIL) {
938 struct task_struct *task;
940 task = container_of(node, struct task_struct, wake_q);
941 /* Task can safely be re-inserted now: */
943 task->wake_q.next = NULL;
946 * wake_up_process() executes a full barrier, which pairs with
947 * the queueing in wake_q_add() so as not to miss wakeups.
949 wake_up_process(task);
950 put_task_struct(task);
955 * resched_curr - mark rq's current task 'to be rescheduled now'.
957 * On UP this means the setting of the need_resched flag, on SMP it
958 * might also involve a cross-CPU call to trigger the scheduler on
961 void resched_curr(struct rq *rq)
963 struct task_struct *curr = rq->curr;
966 lockdep_assert_rq_held(rq);
968 if (test_tsk_need_resched(curr))
973 if (cpu == smp_processor_id()) {
974 set_tsk_need_resched(curr);
975 set_preempt_need_resched();
979 if (set_nr_and_not_polling(curr))
980 smp_send_reschedule(cpu);
982 trace_sched_wake_idle_without_ipi(cpu);
985 void resched_cpu(int cpu)
987 struct rq *rq = cpu_rq(cpu);
990 raw_spin_rq_lock_irqsave(rq, flags);
991 if (cpu_online(cpu) || cpu == smp_processor_id())
993 raw_spin_rq_unlock_irqrestore(rq, flags);
997 #ifdef CONFIG_NO_HZ_COMMON
999 * In the semi idle case, use the nearest busy CPU for migrating timers
1000 * from an idle CPU. This is good for power-savings.
1002 * We don't do similar optimization for completely idle system, as
1003 * selecting an idle CPU will add more delays to the timers than intended
1004 * (as that CPU's timer base may not be up to date wrt jiffies etc).
1006 int get_nohz_timer_target(void)
1008 int i, cpu = smp_processor_id(), default_cpu = -1;
1009 struct sched_domain *sd;
1011 if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
1018 for_each_domain(cpu, sd) {
1019 for_each_cpu_and(i, sched_domain_span(sd),
1020 housekeeping_cpumask(HK_FLAG_TIMER)) {
1031 if (default_cpu == -1)
1032 default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
1040 * When add_timer_on() enqueues a timer into the timer wheel of an
1041 * idle CPU then this timer might expire before the next timer event
1042 * which is scheduled to wake up that CPU. In case of a completely
1043 * idle system the next event might even be infinite time into the
1044 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
1045 * leaves the inner idle loop so the newly added timer is taken into
1046 * account when the CPU goes back to idle and evaluates the timer
1047 * wheel for the next timer event.
1049 static void wake_up_idle_cpu(int cpu)
1051 struct rq *rq = cpu_rq(cpu);
1053 if (cpu == smp_processor_id())
1056 if (set_nr_and_not_polling(rq->idle))
1057 smp_send_reschedule(cpu);
1059 trace_sched_wake_idle_without_ipi(cpu);
1062 static bool wake_up_full_nohz_cpu(int cpu)
1065 * We just need the target to call irq_exit() and re-evaluate
1066 * the next tick. The nohz full kick at least implies that.
1067 * If needed we can still optimize that later with an
1070 if (cpu_is_offline(cpu))
1071 return true; /* Don't try to wake offline CPUs. */
1072 if (tick_nohz_full_cpu(cpu)) {
1073 if (cpu != smp_processor_id() ||
1074 tick_nohz_tick_stopped())
1075 tick_nohz_full_kick_cpu(cpu);
1083 * Wake up the specified CPU. If the CPU is going offline, it is the
1084 * caller's responsibility to deal with the lost wakeup, for example,
1085 * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
1087 void wake_up_nohz_cpu(int cpu)
1089 if (!wake_up_full_nohz_cpu(cpu))
1090 wake_up_idle_cpu(cpu);
1093 static void nohz_csd_func(void *info)
1095 struct rq *rq = info;
1096 int cpu = cpu_of(rq);
1100 * Release the rq::nohz_csd.
1102 flags = atomic_fetch_andnot(NOHZ_KICK_MASK | NOHZ_NEWILB_KICK, nohz_flags(cpu));
1103 WARN_ON(!(flags & NOHZ_KICK_MASK));
1105 rq->idle_balance = idle_cpu(cpu);
1106 if (rq->idle_balance && !need_resched()) {
1107 rq->nohz_idle_balance = flags;
1108 raise_softirq_irqoff(SCHED_SOFTIRQ);
1112 #endif /* CONFIG_NO_HZ_COMMON */
1114 #ifdef CONFIG_NO_HZ_FULL
1115 bool sched_can_stop_tick(struct rq *rq)
1117 int fifo_nr_running;
1119 /* Deadline tasks, even if single, need the tick */
1120 if (rq->dl.dl_nr_running)
1124 * If there is more than one RR task, we need the tick to affect the
1125 * actual RR behaviour.
1127 if (rq->rt.rr_nr_running) {
1128 if (rq->rt.rr_nr_running == 1)
1135 * If there are no RR tasks, but there are FIFO tasks, we can skip the tick, no
1136 * forced preemption between FIFO tasks.
1138 fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
1139 if (fifo_nr_running)
1143 * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left;
1144 * if there's more than one we need the tick for involuntary
1147 if (rq->nr_running > 1)
1152 #endif /* CONFIG_NO_HZ_FULL */
1153 #endif /* CONFIG_SMP */
1155 #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
1156 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
1158 * Iterate task_group tree rooted at *from, calling @down when first entering a
1159 * node and @up when leaving it for the final time.
1161 * Caller must hold rcu_lock or sufficient equivalent.
1163 int walk_tg_tree_from(struct task_group *from,
1164 tg_visitor down, tg_visitor up, void *data)
1166 struct task_group *parent, *child;
1172 ret = (*down)(parent, data);
1175 list_for_each_entry_rcu(child, &parent->children, siblings) {
1182 ret = (*up)(parent, data);
1183 if (ret || parent == from)
1187 parent = parent->parent;
1194 int tg_nop(struct task_group *tg, void *data)
1200 static void set_load_weight(struct task_struct *p, bool update_load)
1202 int prio = p->static_prio - MAX_RT_PRIO;
1203 struct load_weight *load = &p->se.load;
1206 * SCHED_IDLE tasks get minimal weight:
1208 if (task_has_idle_policy(p)) {
1209 load->weight = scale_load(WEIGHT_IDLEPRIO);
1210 load->inv_weight = WMULT_IDLEPRIO;
1215 * SCHED_OTHER tasks have to update their load when changing their
1218 if (update_load && p->sched_class == &fair_sched_class) {
1219 reweight_task(p, prio);
1221 load->weight = scale_load(sched_prio_to_weight[prio]);
1222 load->inv_weight = sched_prio_to_wmult[prio];
1226 #ifdef CONFIG_UCLAMP_TASK
1228 * Serializes updates of utilization clamp values
1230 * The (slow-path) user-space triggers utilization clamp value updates which
1231 * can require updates on (fast-path) scheduler's data structures used to
1232 * support enqueue/dequeue operations.
1233 * While the per-CPU rq lock protects fast-path update operations, user-space
1234 * requests are serialized using a mutex to reduce the risk of conflicting
1235 * updates or API abuses.
1237 static DEFINE_MUTEX(uclamp_mutex);
1239 /* Max allowed minimum utilization */
1240 unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
1242 /* Max allowed maximum utilization */
1243 unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
1246 * By default RT tasks run at the maximum performance point/capacity of the
1247 * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to
1248 * SCHED_CAPACITY_SCALE.
1250 * This knob allows admins to change the default behavior when uclamp is being
1251 * used. In battery powered devices, particularly, running at the maximum
1252 * capacity and frequency will increase energy consumption and shorten the
1255 * This knob only affects RT tasks whose uclamp_se->user_defined == false.
1257 * This knob will not override the system default sched_util_clamp_min defined
1260 unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
1262 /* All clamps are required to be less than or equal to these values */
1263 static struct uclamp_se uclamp_default[UCLAMP_CNT];
1266 * This static key is used to reduce the uclamp overhead in the fast path. It
1267 * primarily disables the call to uclamp_rq_{inc, dec}() in
1268 * enqueue/dequeue_task().
1270 * This allows users to continue to enable uclamp in their kernel config with
1271 * minimum uclamp overhead in the fast path.
1273 * As soon as userspace modifies any of the uclamp knobs, the static key is
1274 * enabled, since we have actual users that make use of uclamp
1277 * The knobs that would enable this static key are:
1279 * * A task modifying its uclamp value with sched_setattr().
1280 * * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs.
1281 * * An admin modifying the cgroup cpu.uclamp.{min, max}
1283 DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
1285 /* Integer rounded range for each bucket */
1286 #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
/* Loop header: steps (clamp_id) from 0 up to, but not including, UCLAMP_CNT. */
1288 #define for_each_clamp_id(clamp_id) \
1289 for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
1291 static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
1293 return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
1296 static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
1298 if (clamp_id == UCLAMP_MIN)
1300 return SCHED_CAPACITY_SCALE;
1303 static inline void uclamp_se_set(struct uclamp_se *uc_se,
1304 unsigned int value, bool user_defined)
1306 uc_se->value = value;
1307 uc_se->bucket_id = uclamp_bucket_id(value);
1308 uc_se->user_defined = user_defined;
1311 static inline unsigned int
1312 uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
1313 unsigned int clamp_value)
1316 * Avoid blocked utilization pushing up the frequency when we go
1317 * idle (which drops the max-clamp) by retaining the last known
1320 if (clamp_id == UCLAMP_MAX) {
1321 rq->uclamp_flags |= UCLAMP_FLAG_IDLE;
1325 return uclamp_none(UCLAMP_MIN);
1328 static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
1329 unsigned int clamp_value)
1331 /* Reset max-clamp retention only on idle exit */
1332 if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
1335 WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
1339 unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
1340 unsigned int clamp_value)
1342 struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
1343 int bucket_id = UCLAMP_BUCKETS - 1;
1346 * Since both min and max clamps are max aggregated, find the
1347 * top most bucket with tasks in.
1349 for ( ; bucket_id >= 0; bucket_id--) {
1350 if (!bucket[bucket_id].tasks)
1352 return bucket[bucket_id].value;
1355 /* No tasks -- default clamp values */
1356 return uclamp_idle_value(rq, clamp_id, clamp_value);
1359 static void __uclamp_update_util_min_rt_default(struct task_struct *p)
1361 unsigned int default_util_min;
1362 struct uclamp_se *uc_se;
1364 lockdep_assert_held(&p->pi_lock);
1366 uc_se = &p->uclamp_req[UCLAMP_MIN];
1368 /* Only sync if user didn't override the default */
1369 if (uc_se->user_defined)
1372 default_util_min = sysctl_sched_uclamp_util_min_rt_default;
1373 uclamp_se_set(uc_se, default_util_min, false);
/*
 * uclamp_update_util_min_rt_default() - locked wrapper around
 * __uclamp_update_util_min_rt_default().
 *
 * Takes task_rq_lock() for @p, applies the update and drops the lock again.
 * NOTE(review): the local declarations (rq, rf) and any precondition check
 * before the lock are missing from this extract -- confirm.
 */
1376 static void uclamp_update_util_min_rt_default(struct task_struct *p)
1384 /* Protect updates to p->uclamp_* */
1385 rq = task_rq_lock(p, &rf);
1386 __uclamp_update_util_min_rt_default(p);
1387 task_rq_unlock(rq, p, &rf);
/*
 * uclamp_sync_util_min_rt_default() - apply the RT default min clamp to
 * every thread in the system.
 *
 * First takes and releases tasklist_lock for reading with an
 * smp_mb__after_spinlock() in between, purely for ordering against fork
 * (see the diagram below); then walks all threads with
 * for_each_process_thread() and calls uclamp_update_util_min_rt_default()
 * on each.
 * NOTE(review): whatever protects the thread walk (lines around the loop
 * are missing from this extract) is not visible here -- confirm.
 */
1390 static void uclamp_sync_util_min_rt_default(void)
1392 struct task_struct *g, *p;
1395 * copy_process() sysctl_uclamp
1396 * uclamp_min_rt = X;
1397 * write_lock(&tasklist_lock) read_lock(&tasklist_lock)
1398 * // link thread smp_mb__after_spinlock()
1399 * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock);
1400 * sched_post_fork() for_each_process_thread()
1401 * __uclamp_sync_rt() __uclamp_sync_rt()
1403 * Ensures that either sched_post_fork() will observe the new
1404 * uclamp_min_rt or for_each_process_thread() will observe the new
1407 read_lock(&tasklist_lock);
1408 smp_mb__after_spinlock();
1409 read_unlock(&tasklist_lock);
1412 for_each_process_thread(g, p)
1413 uclamp_update_util_min_rt_default(p);
/*
 * uclamp_tg_restrict() - @p's requested clamp for @clamp_id, restricted by
 * its task group.
 *
 * Works on a by-value copy of p->uclamp_req[clamp_id]. With
 * CONFIG_UCLAMP_TASK_GROUP, the requested value is clamped into the
 * [UCLAMP_MIN, UCLAMP_MAX] range of the task group's effective clamps and
 * stored back into the copy as not user-defined.
 * NOTE(review): the statements under the autogroup / root-group tests and
 * the final return are missing from this extract -- presumably those tests
 * return the unmodified copy; confirm.
 */
1417 static inline struct uclamp_se
1418 uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
1420 /* Copy by value as we could modify it */
1421 struct uclamp_se uc_req = p->uclamp_req[clamp_id];
1422 #ifdef CONFIG_UCLAMP_TASK_GROUP
1423 unsigned int tg_min, tg_max, value;
1426 * Tasks in autogroups or root task group will be
1427 * restricted by system defaults.
1429 if (task_group_is_autogroup(task_group(p)))
1431 if (task_group(p) == &root_task_group)
1434 tg_min = task_group(p)->uclamp[UCLAMP_MIN].value;
1435 tg_max = task_group(p)->uclamp[UCLAMP_MAX].value;
1436 value = uc_req.value;
1437 value = clamp(value, tg_min, tg_max);
1438 uclamp_se_set(&uc_req, value, false);
1445 * The effective clamp bucket index of a task depends on, by increasing
1447 * - the task specific clamp value, when explicitly requested from userspace
1448 * - the task group effective clamp value, for tasks that are in neither
1449 * the root group nor an autogroup
1450 * - the system default clamp value, defined by the sysadmin
/*
 * uclamp_eff_get() - effective clamp of @p for @clamp_id.
 *
 * Starts from the task-group-restricted request (uclamp_tg_restrict()) and
 * compares it against the system default uclamp_default[clamp_id].
 * NOTE(review): both return statements are missing from this extract --
 * presumably the system default is returned when the request exceeds it
 * and the request otherwise; confirm.
 */
1452 static inline struct uclamp_se
1453 uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
1455 struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
1456 struct uclamp_se uc_max = uclamp_default[clamp_id];
1458 /* System default restrictions always apply */
1459 if (unlikely(uc_req.value > uc_max.value))
1465 unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
1467 struct uclamp_se uc_eff;
1469 /* Task currently refcounted: use back-annotated (effective) value */
1470 if (p->uclamp[clamp_id].active)
1471 return (unsigned long)p->uclamp[clamp_id].value;
1473 uc_eff = uclamp_eff_get(p, clamp_id);
1475 return (unsigned long)uc_eff.value;
1479 * When a task is enqueued on a rq, the clamp bucket currently defined by the
1480 * task's uclamp::bucket_id is refcounted on that rq. This also immediately
1481 * updates the rq's clamp value if required.
1483 * Tasks can have a task-specific value requested from user-space, track
1484 * within each bucket the maximum value for tasks refcounted in it.
1485 * This "local max aggregation" makes it possible to track the exact "requested" value
1486 * for each bucket when all its RUNNABLE tasks require the same clamp.
/*
 * uclamp_rq_inc_id() - account @p's clamp for @clamp_id on @rq.
 *
 * Requires the rq lock (lockdep_assert_rq_held()). Refreshes
 * p->uclamp[clamp_id] from uclamp_eff_get(), marks it active, gives
 * uclamp_idle_reset() a chance to reset the rq value, then raises the
 * bucket value and the rq-wide value when the task's value is higher.
 * NOTE(review): the bucket task-count update is not visible in this
 * extract (a line is missing after the bucket lookup) -- confirm.
 */
1488 static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
1489 enum uclamp_id clamp_id)
1491 struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
1492 struct uclamp_se *uc_se = &p->uclamp[clamp_id];
1493 struct uclamp_bucket *bucket;
1495 lockdep_assert_rq_held(rq);
1497 /* Update task effective clamp */
1498 p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
1500 bucket = &uc_rq->bucket[uc_se->bucket_id];
1502 uc_se->active = true;
1504 uclamp_idle_reset(rq, clamp_id, uc_se->value);
1507 * Local max aggregation: rq buckets always track the max
1508 * "requested" clamp value of its RUNNABLE tasks.
1510 if (bucket->tasks == 1 || uc_se->value > bucket->value)
1511 bucket->value = uc_se->value;
1513 if (uc_se->value > READ_ONCE(uc_rq->value))
1514 WRITE_ONCE(uc_rq->value, uc_se->value);
1518 * When a task is dequeued from a rq, the clamp bucket refcounted by the task
1519 * is released. If this is the last task reference counting the rq's max
1520 * active clamp value, then the rq's clamp value is updated.
1522 * Both refcounted tasks and rq's cached clamp values are expected to be
1523 * always valid. If it's detected they are not, as defensive programming,
1524 * enforce the expected state and warn.
/*
 * uclamp_rq_dec_id() - release @p's clamp accounting for @clamp_id on @rq.
 *
 * Requires the rq lock (lockdep_assert_rq_held()). Clears uc_se->active
 * and, when the released bucket held the rq-wide value
 * (bucket->value >= rq_clamp), recomputes the rq value with
 * uclamp_rq_max_value() and publishes it with WRITE_ONCE().
 * NOTE(review): the statements guarded by the "!uc_se->active" test and
 * the two "likely(bucket->tasks)" tests are missing from this extract --
 * presumably an early return, the task-count decrement and another early
 * return respectively; confirm.
 */
1526 static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
1527 enum uclamp_id clamp_id)
1529 struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
1530 struct uclamp_se *uc_se = &p->uclamp[clamp_id];
1531 struct uclamp_bucket *bucket;
1532 unsigned int bkt_clamp;
1533 unsigned int rq_clamp;
1535 lockdep_assert_rq_held(rq);
1538 * If sched_uclamp_used was enabled after task @p was enqueued,
1539 * we could end up with unbalanced call to uclamp_rq_dec_id().
1541 * In this case the uc_se->active flag should be false since no uclamp
1542 * accounting was performed at enqueue time and we can just return
1545 * Need to be careful of the following enqueue/dequeue ordering
1549 * // sched_uclamp_used gets enabled
1552 * // Must not decrement bucket->tasks here
1555 * where we could end up with stale data in uc_se and
1556 * bucket[uc_se->bucket_id].
1558 * The following check here eliminates the possibility of such race.
1560 if (unlikely(!uc_se->active))
1563 bucket = &uc_rq->bucket[uc_se->bucket_id];
1565 SCHED_WARN_ON(!bucket->tasks);
1566 if (likely(bucket->tasks))
1569 uc_se->active = false;
1572 * Keep "local max aggregation" simple and accept to (possibly)
1573 * overboost some RUNNABLE tasks in the same bucket.
1574 * The rq clamp bucket value is reset to its base value whenever
1575 * there are no more RUNNABLE tasks refcounting it.
1577 if (likely(bucket->tasks))
1580 rq_clamp = READ_ONCE(uc_rq->value);
1582 * Defensive programming: this should never happen. If it happens,
1583 * e.g. due to future modification, warn and fixup the expected value.
1585 SCHED_WARN_ON(bucket->value > rq_clamp);
1586 if (bucket->value >= rq_clamp) {
1587 bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
1588 WRITE_ONCE(uc_rq->value, bkt_clamp);
/*
 * uclamp_rq_inc() - enqueue-side clamp accounting for @p on @rq.
 *
 * Calls uclamp_rq_inc_id() for every clamp index and afterwards clears
 * UCLAMP_FLAG_IDLE from rq->uclamp_flags if it was set.
 * NOTE(review): the statements guarded by the sched_uclamp_used static
 * branch test and by the sched_class->uclamp_enabled test are missing from
 * this extract -- presumably early returns; confirm.
 */
1592 static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
1594 enum uclamp_id clamp_id;
1597 * Avoid any overhead until uclamp is actually used by the userspace.
1599 * The condition is constructed such that a NOP is generated when
1600 * sched_uclamp_used is disabled.
1602 if (!static_branch_unlikely(&sched_uclamp_used))
1605 if (unlikely(!p->sched_class->uclamp_enabled))
1608 for_each_clamp_id(clamp_id)
1609 uclamp_rq_inc_id(rq, p, clamp_id);
1611 /* Reset clamp idle holding when there is one RUNNABLE task */
1612 if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
1613 rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
/*
 * uclamp_rq_dec() - dequeue-side clamp accounting for @p on @rq.
 *
 * Calls uclamp_rq_dec_id() for every clamp index.
 * NOTE(review): the statements guarded by the sched_uclamp_used static
 * branch test and by the sched_class->uclamp_enabled test are missing from
 * this extract -- presumably early returns; confirm.
 */
1616 static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
1618 enum uclamp_id clamp_id;
1621 * Avoid any overhead until uclamp is actually used by the userspace.
1623 * The condition is constructed such that a NOP is generated when
1624 * sched_uclamp_used is disabled.
1626 if (!static_branch_unlikely(&sched_uclamp_used))
1629 if (unlikely(!p->sched_class->uclamp_enabled))
1632 for_each_clamp_id(clamp_id)
1633 uclamp_rq_dec_id(rq, p, clamp_id);
/*
 * uclamp_update_active() - refresh the rq accounting of @p's clamps.
 *
 * Under task_rq_lock(), every clamp index that is currently active for @p
 * is released and immediately re-accounted (uclamp_rq_dec_id() followed by
 * uclamp_rq_inc_id()), which re-evaluates the task's effective clamp.
 * NOTE(review): the return-type line and the rq/rf declarations are
 * missing from this extract.
 */
1637 uclamp_update_active(struct task_struct *p)
1639 enum uclamp_id clamp_id;
1644 * Lock the task and the rq where the task is (or was) queued.
1646 * We might lock the (previous) rq of a !RUNNABLE task, but that's the
1647 * price to pay to safely serialize util_{min,max} updates with
1648 * enqueues, dequeues and migration operations.
1649 * This is the same locking schema used by __set_cpus_allowed_ptr().
1651 rq = task_rq_lock(p, &rf);
1654 * Setting the clamp bucket is serialized by task_rq_lock().
1655 * If the task is not yet RUNNABLE and its task_struct is not
1656 * affecting a valid clamp bucket, the next time it's enqueued,
1657 * it will already see the updated clamp bucket value.
1659 for_each_clamp_id(clamp_id) {
1660 if (p->uclamp[clamp_id].active) {
1661 uclamp_rq_dec_id(rq, p, clamp_id);
1662 uclamp_rq_inc_id(rq, p, clamp_id);
1666 task_rq_unlock(rq, p, &rf);
1669 #ifdef CONFIG_UCLAMP_TASK_GROUP
/*
 * uclamp_update_active_tasks() - call uclamp_update_active() on every task
 * returned by a css_task_iter walk over @css.
 * NOTE(review): the return-type line is missing from this extract.
 */
1671 uclamp_update_active_tasks(struct cgroup_subsys_state *css)
1673 struct css_task_iter it;
1674 struct task_struct *p;
1676 css_task_iter_start(css, 0, &it);
1677 while ((p = css_task_iter_next(&it)))
1678 uclamp_update_active(p);
1679 css_task_iter_end(&it);
/*
 * uclamp_update_root_tg() - mirror the sysctl min/max clamps into the root
 * task group's requests and propagate them with cpu_util_update_eff().
 *
 * cpu_util_update_eff() is only forward-declared here; its definition is
 * outside this view.
 * NOTE(review): lines around the cpu_util_update_eff() call are missing
 * from this extract, so any locking around it is not visible -- confirm.
 */
1682 static void cpu_util_update_eff(struct cgroup_subsys_state *css);
1683 static void uclamp_update_root_tg(void)
1685 struct task_group *tg = &root_task_group;
1687 uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
1688 sysctl_sched_uclamp_util_min, false);
1689 uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
1690 sysctl_sched_uclamp_util_max, false);
1693 cpu_util_update_eff(&root_task_group.css);
/* No-op variant of uclamp_update_root_tg(); presumably the !CONFIG_UCLAMP_TASK_GROUP branch -- the #else line is not visible here. */
1697 static void uclamp_update_root_tg(void) { }
/*
 * sysctl_sched_uclamp_handler() - sysctl handler for the uclamp knobs
 * (util_min, util_max, util_min_rt_default).
 *
 * Runs under uclamp_mutex. The previous values are saved, proc_dointvec()
 * parses the request, and the new values are range-checked (min <= max,
 * max and min_rt_default <= SCHED_CAPACITY_SCALE). Changed min/max values
 * are copied into uclamp_default[] and pushed to the root task group; a
 * changed RT default is synced to all tasks. Both paths enable the
 * sched_uclamp_used static branch. The saved values are written back on
 * the restore path near the end.
 * NOTE(review): the "result" declaration, the error checks after
 * proc_dointvec(), the labels/gotos and the return are missing from this
 * extract, so exactly which failures reach the restore path is not
 * visible -- confirm.
 */
1700 int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
1701 void *buffer, size_t *lenp, loff_t *ppos)
1703 bool update_root_tg = false;
1704 int old_min, old_max, old_min_rt;
1707 mutex_lock(&uclamp_mutex);
1708 old_min = sysctl_sched_uclamp_util_min;
1709 old_max = sysctl_sched_uclamp_util_max;
1710 old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
1712 result = proc_dointvec(table, write, buffer, lenp, ppos);
1718 if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
1719 sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
1720 sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
1726 if (old_min != sysctl_sched_uclamp_util_min) {
1727 uclamp_se_set(&uclamp_default[UCLAMP_MIN],
1728 sysctl_sched_uclamp_util_min, false);
1729 update_root_tg = true;
1731 if (old_max != sysctl_sched_uclamp_util_max) {
1732 uclamp_se_set(&uclamp_default[UCLAMP_MAX],
1733 sysctl_sched_uclamp_util_max, false);
1734 update_root_tg = true;
1737 if (update_root_tg) {
1738 static_branch_enable(&sched_uclamp_used);
1739 uclamp_update_root_tg();
1742 if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
1743 static_branch_enable(&sched_uclamp_used);
1744 uclamp_sync_util_min_rt_default();
1748 * We update all RUNNABLE tasks only when task groups are in use.
1749 * Otherwise, keep it simple and do just a lazy update at each next
1750 * task enqueue time.
1756 sysctl_sched_uclamp_util_min = old_min;
1757 sysctl_sched_uclamp_util_max = old_max;
1758 sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
1760 mutex_unlock(&uclamp_mutex);
/*
 * uclamp_validate() - sanity-check the clamp values carried by @attr.
 *
 * util_min/util_max start from @p's current requests and are overridden by
 * @attr when the matching SCHED_FLAG_UTIL_CLAMP_{MIN,MAX} flag is set.
 * The "+ 1" on both sides of the range checks lets the value -1 pass
 * (-1 + 1 == 0), in addition to 0..SCHED_CAPACITY_SCALE; -1 is the reset
 * request handled by uclamp_reset(). The ordering check min <= max is
 * skipped when either side is -1. On success the sched_uclamp_used static
 * branch is enabled.
 * NOTE(review): the return statements (error codes and success value) are
 * missing from this extract -- confirm.
 */
1765 static int uclamp_validate(struct task_struct *p,
1766 const struct sched_attr *attr)
1768 int util_min = p->uclamp_req[UCLAMP_MIN].value;
1769 int util_max = p->uclamp_req[UCLAMP_MAX].value;
1771 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1772 util_min = attr->sched_util_min;
1774 if (util_min + 1 > SCHED_CAPACITY_SCALE + 1)
1778 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1779 util_max = attr->sched_util_max;
1781 if (util_max + 1 > SCHED_CAPACITY_SCALE + 1)
1785 if (util_min != -1 && util_max != -1 && util_min > util_max)
1789 * We have valid uclamp attributes; make sure uclamp is enabled.
1791 * We need to do that here, because enabling static branches is a
1792 * blocking operation which obviously cannot be done while holding
1795 static_branch_enable(&sched_uclamp_used);
/*
 * uclamp_reset() - decide whether the request @uc_se for @clamp_id should
 * be reset to its default.
 *
 * Three conditions are tested: no SCHED_FLAG_UTIL_CLAMP in @attr together
 * with a non user-defined request; or, for the matching clamp index, the
 * MIN/MAX flag set with a value of -1.
 * NOTE(review): all return statements are missing from this extract --
 * presumably each condition returns true and the fall-through returns
 * false; confirm.
 */
1800 static bool uclamp_reset(const struct sched_attr *attr,
1801 enum uclamp_id clamp_id,
1802 struct uclamp_se *uc_se)
1804 /* Reset on sched class change for a non user-defined clamp value. */
1805 if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
1806 !uc_se->user_defined)
1809 /* Reset on sched_util_{min,max} == -1. */
1810 if (clamp_id == UCLAMP_MIN &&
1811 attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
1812 attr->sched_util_min == -1) {
1816 if (clamp_id == UCLAMP_MAX &&
1817 attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
1818 attr->sched_util_max == -1) {
/*
 * __setscheduler_uclamp() - apply the clamp part of @attr to @p's requests.
 *
 * Pass 1: for each clamp index consulted through uclamp_reset(), the
 * request is set to a default (sysctl_sched_uclamp_util_min_rt_default for
 * the min clamp of RT tasks, uclamp_none() otherwise), marked as not
 * user-defined.
 * Pass 2: when @attr carries SCHED_FLAG_UTIL_CLAMP_{MIN,MAX} with a value
 * other than -1, that value is stored as a user-defined request.
 * NOTE(review): the "value" declaration, the statement under the
 * "!uclamp_reset()" test (presumably "continue"), an "else" before the
 * uclamp_none() assignment and the statement under the second
 * SCHED_FLAG_UTIL_CLAMP test (presumably "return") are missing from this
 * extract -- confirm.
 */
1825 static void __setscheduler_uclamp(struct task_struct *p,
1826 const struct sched_attr *attr)
1828 enum uclamp_id clamp_id;
1830 for_each_clamp_id(clamp_id) {
1831 struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
1834 if (!uclamp_reset(attr, clamp_id, uc_se))
1838 * RT by default have a 100% boost value that could be modified
1841 if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
1842 value = sysctl_sched_uclamp_util_min_rt_default;
1844 value = uclamp_none(clamp_id);
1846 uclamp_se_set(uc_se, value, false);
1850 if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
1853 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
1854 attr->sched_util_min != -1) {
1855 uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
1856 attr->sched_util_min, true);
1859 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
1860 attr->sched_util_max != -1) {
1861 uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
1862 attr->sched_util_max, true);
/*
 * uclamp_fork() - initialise clamp state of the freshly forked task @p.
 *
 * Always clears the "active" flag of every clamp; the second loop resets
 * every request to uclamp_none(), marked as not user-defined.
 * NOTE(review): the statement under the sched_reset_on_fork test is
 * missing from this extract -- presumably "return", so that requests are
 * only reset when sched_reset_on_fork is set; confirm.
 */
1866 static void uclamp_fork(struct task_struct *p)
1868 enum uclamp_id clamp_id;
1871 * We don't need to hold task_rq_lock() when updating p->uclamp_* here
1872 * as the task is still at its early fork stages.
1874 for_each_clamp_id(clamp_id)
1875 p->uclamp[clamp_id].active = false;
1877 if (likely(!p->sched_reset_on_fork))
1880 for_each_clamp_id(clamp_id) {
1881 uclamp_se_set(&p->uclamp_req[clamp_id],
1882 uclamp_none(clamp_id), false);
/*
 * uclamp_post_fork() - post-fork hook for utilization clamping.
 *
 * Simply re-runs uclamp_update_util_min_rt_default() on the new task @p.
 */
static void uclamp_post_fork(struct task_struct *p)
{
	uclamp_update_util_min_rt_default(p);
}
1891 static void __init init_uclamp_rq(struct rq *rq)
1893 enum uclamp_id clamp_id;
1894 struct uclamp_rq *uc_rq = rq->uclamp;
1896 for_each_clamp_id(clamp_id) {
1897 uc_rq[clamp_id] = (struct uclamp_rq) {
1898 .value = uclamp_none(clamp_id)
1902 rq->uclamp_flags = 0;
/*
 * init_uclamp() - boot-time initialisation of utilization clamping.
 *
 * Initialises the clamp state of every possible CPU's runqueue, sets
 * init_task's requests to uclamp_none(), and seeds uclamp_default[] (plus
 * the root task group's request and effective clamps when
 * CONFIG_UCLAMP_TASK_GROUP is set) with uclamp_none(UCLAMP_MAX) for both
 * clamp indexes.
 * NOTE(review): the "cpu" declaration and the closing #endif are missing
 * from this extract.
 */
1905 static void __init init_uclamp(void)
1907 struct uclamp_se uc_max = {};
1908 enum uclamp_id clamp_id;
1911 for_each_possible_cpu(cpu)
1912 init_uclamp_rq(cpu_rq(cpu));
1914 for_each_clamp_id(clamp_id) {
1915 uclamp_se_set(&init_task.uclamp_req[clamp_id],
1916 uclamp_none(clamp_id), false);
1919 /* System defaults allow max clamp values for both indexes */
1920 uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
1921 for_each_clamp_id(clamp_id) {
1922 uclamp_default[clamp_id] = uc_max;
1923 #ifdef CONFIG_UCLAMP_TASK_GROUP
1924 root_task_group.uclamp_req[clamp_id] = uc_max;
1925 root_task_group.uclamp[clamp_id] = uc_max;
1930 #else /* CONFIG_UCLAMP_TASK */
/*
 * Stubs used when CONFIG_UCLAMP_TASK is not set: same names and signatures
 * as the real implementations so that callers need no #ifdef.
 * NOTE(review): the body of the uclamp_validate() stub is missing from
 * this extract, so its return value is not visible -- confirm.
 */
1931 static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
1932 static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
1933 static inline int uclamp_validate(struct task_struct *p,
1934 const struct sched_attr *attr)
1938 static void __setscheduler_uclamp(struct task_struct *p,
1939 const struct sched_attr *attr) { }
1940 static inline void uclamp_fork(struct task_struct *p) { }
1941 static inline void uclamp_post_fork(struct task_struct *p) { }
1942 static inline void init_uclamp(void) { }
1943 #endif /* CONFIG_UCLAMP_TASK */
/*
 * sched_task_on_rq() - report whether @p is currently queued on a runqueue.
 *
 * Non-inline wrapper around task_on_rq_queued().
 */
bool sched_task_on_rq(struct task_struct *p)
{
	const bool queued = task_on_rq_queued(p);

	return queued;
}
1950 static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1952 if (!(flags & ENQUEUE_NOCLOCK))
1953 update_rq_clock(rq);
1955 if (!(flags & ENQUEUE_RESTORE)) {
1956 sched_info_enqueue(rq, p);
1957 psi_enqueue(p, flags & ENQUEUE_WAKEUP);
1960 uclamp_rq_inc(rq, p);
1961 p->sched_class->enqueue_task(rq, p, flags);
1963 if (sched_core_enabled(rq))
1964 sched_core_enqueue(rq, p);
1967 static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1969 if (sched_core_enabled(rq))
1970 sched_core_dequeue(rq, p);
1972 if (!(flags & DEQUEUE_NOCLOCK))
1973 update_rq_clock(rq);
1975 if (!(flags & DEQUEUE_SAVE)) {
1976 sched_info_dequeue(rq, p);
1977 psi_dequeue(p, flags & DEQUEUE_SLEEP);
1980 uclamp_rq_dec(rq, p);
1981 p->sched_class->dequeue_task(rq, p, flags);
1984 void activate_task(struct rq *rq, struct task_struct *p, int flags)
1986 enqueue_task(rq, p, flags);
1988 p->on_rq = TASK_ON_RQ_QUEUED;
1991 void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1993 p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
1995 dequeue_task(rq, p, flags);
/*
 * __normal_prio() - map a (policy, rt_prio, nice) triple to a priority.
 *
 * Deadline policies get MAX_DL_PRIO - 1, RT policies get
 * MAX_RT_PRIO - 1 - rt_prio, and the remaining assignment uses
 * NICE_TO_PRIO(nice).
 * NOTE(review): the "prio" declaration, the final "else" and the return
 * are missing from this extract.
 */
1998 static inline int __normal_prio(int policy, int rt_prio, int nice)
2002 if (dl_policy(policy))
2003 prio = MAX_DL_PRIO - 1;
2004 else if (rt_policy(policy))
2005 prio = MAX_RT_PRIO - 1 - rt_prio;
2007 prio = NICE_TO_PRIO(nice);
2013 * Calculate the expected normal priority: i.e. priority
2014 * without taking RT-inheritance into account. Might be
2015 * boosted by interactivity modifiers. Changes upon fork,
2016 * setprio syscalls, and whenever the interactivity
2017 * estimator recalculates.
2019 static inline int normal_prio(struct task_struct *p)
2021 return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
2025 * Calculate the current priority, i.e. the priority
2026 * taken into account by the scheduler. This value might
2027 * be boosted by RT tasks, or might be boosted by
2028 * interactivity modifiers. Will be RT if the task got
2029 * RT-boosted. If not then it returns p->normal_prio.
/*
 * effective_prio() - recompute p->normal_prio and return the priority the
 * task should run at.
 *
 * Side effect: p->normal_prio is always refreshed from normal_prio(p).
 * When the current p->prio is not an RT priority, the refreshed
 * normal_prio is returned.
 * NOTE(review): the final return is missing from this extract --
 * presumably p->prio, as described in the comment below; confirm.
 */
2031 static int effective_prio(struct task_struct *p)
2033 p->normal_prio = normal_prio(p);
2035 * If we are RT tasks or we were boosted to RT priority,
2036 * keep the priority unchanged. Otherwise, update priority
2037 * to the normal priority:
2039 if (!rt_prio(p->prio))
2040 return p->normal_prio;
2045 * task_curr - is this task currently executing on a CPU?
2046 * @p: the task in question.
2048 * Return: 1 if the task is currently executing. 0 otherwise.
inline int task_curr(const struct task_struct *p)
{
	/* Compare against whatever is running on the CPU @p is assigned to. */
	const int cpu = task_cpu(p);

	return cpu_curr(cpu) == p;
}
2056 * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
2057 * use the balance_callback list if you want balancing.
2059 * this means any call to check_class_changed() must be followed by a call to
2060 * balance_callback().
/*
 * check_class_changed() - run the scheduling-class callbacks after @p's
 * class or priority may have changed.
 *
 * If the class changed: the previous class's switched_from() (optional,
 * NULL-checked) and then the new class's switched_to() are invoked.
 * Otherwise prio_changed() is invoked when the priority differs from
 * oldprio or when @p is a deadline task.
 * NOTE(review): the last parameter line (oldprio) is missing from this
 * extract.
 */
2062 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2063 const struct sched_class *prev_class,
2066 if (prev_class != p->sched_class) {
2067 if (prev_class->switched_from)
2068 prev_class->switched_from(rq, p);
2070 p->sched_class->switched_to(rq, p);
2071 } else if (oldprio != p->prio || dl_task(p))
2072 p->sched_class->prio_changed(rq, p, oldprio);
/*
 * check_preempt_curr() - decide whether @p should preempt the task
 * currently running on @rq.
 *
 * Same class: the decision is delegated to that class's
 * check_preempt_curr() callback. Different class: the class pointers are
 * compared directly. Finally, if rq->curr is queued and already marked for
 * rescheduling, the next rq clock update is skipped.
 * NOTE(review): the statement under the "p->sched_class >" test is missing
 * from this extract -- presumably a reschedule of rq->curr; confirm.
 */
2075 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2077 if (p->sched_class == rq->curr->sched_class)
2078 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
2079 else if (p->sched_class > rq->curr->sched_class)
2083 * A queue event has occurred, and we're going to schedule. In
2084 * this case, we can save a useless back to back clock update.
2086 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
2087 rq_clock_skip_update(rq);
2093 __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
2095 static int __set_cpus_allowed_ptr(struct task_struct *p,
2096 const struct cpumask *new_mask,
/*
 * migrate_disable_switch() - restrict a migrate-disabled task @p to the
 * CPU of @rq by calling __do_set_cpus_allowed() with SCA_MIGRATE_DISABLE.
 *
 * NOTE(review): the statements under both tests are missing from this
 * extract -- presumably early returns (task not migrate-disabled, or
 * cpus_ptr already redirected away from cpus_mask); confirm.
 */
2099 static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
2101 if (likely(!p->migration_disabled))
2104 if (p->cpus_ptr != &p->cpus_mask)
2108 * Violates locking rules! see comment in __do_set_cpus_allowed().
2110 __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);
/*
 * migrate_disable() - enter a migrate-disabled section for current.
 *
 * Nested calls only bump p->migration_disabled. The outermost call
 * increments this CPU's rq->nr_pinned and sets the counter to 1.
 * NOTE(review): lines are missing from this extract around both paths
 * (after the nested increment, and around the nr_pinned update), so the
 * early return and any preemption handling are not visible -- confirm.
 */
2113 void migrate_disable(void)
2115 struct task_struct *p = current;
2117 if (p->migration_disabled) {
2118 p->migration_disabled++;
2123 this_rq()->nr_pinned++;
2124 p->migration_disabled = 1;
2127 EXPORT_SYMBOL_GPL(migrate_disable);
/*
 * migrate_enable() - leave a migrate-disabled section for current.
 *
 * Nested calls only decrement p->migration_disabled. On the outermost
 * call, if cpus_ptr was redirected away from cpus_mask, the regular mask
 * is re-applied via __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) *before*
 * the counter is cleared and rq->nr_pinned is decremented.
 * NOTE(review): lines are missing from this extract after the nested
 * decrement and around the final section, so the early return and any
 * preemption/barrier handling are not visible -- confirm.
 */
2129 void migrate_enable(void)
2131 struct task_struct *p = current;
2133 if (p->migration_disabled > 1) {
2134 p->migration_disabled--;
2139 * Ensure stop_task runs either before or after this, and that
2140 * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
2143 if (p->cpus_ptr != &p->cpus_mask)
2144 __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
2146 * Mustn't clear migration_disabled() until cpus_ptr points back at the
2147 * regular cpus_mask, otherwise things that race (eg.
2148 * select_fallback_rq) get confused.
2151 p->migration_disabled = 0;
2152 this_rq()->nr_pinned--;
2155 EXPORT_SYMBOL_GPL(migrate_enable);
2157 static inline bool rq_has_pinned_tasks(struct rq *rq)
2159 return rq->nr_pinned;
2163 * Per-CPU kthreads are allowed to run on !active && online CPUs, see
2164 * __set_cpus_allowed_ptr() and select_fallback_rq().
/*
 * is_cpu_allowed() - may @p run on @cpu right now?
 *
 * Checks, in order: membership of @cpu in p->cpus_ptr; migrate-disabled
 * tasks need only an online CPU; user tasks (no PF_KTHREAD) need an active
 * CPU; per-CPU kthreads need only an online CPU; the final visible return
 * requires an online CPU for other kernel threads.
 * NOTE(review): the statement under the cpumask test (presumably
 * "return false") and the check described by the "don't get to stay
 * during offline" comment are missing from this extract -- confirm.
 */
2166 static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
2168 /* When not in the task's cpumask, no point in looking further. */
2169 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
2172 /* migrate_disabled() must be allowed to finish. */
2173 if (is_migration_disabled(p))
2174 return cpu_online(cpu);
2176 /* Non kernel threads are not allowed during either online or offline. */
2177 if (!(p->flags & PF_KTHREAD))
2178 return cpu_active(cpu);
2180 /* KTHREAD_IS_PER_CPU is always allowed. */
2181 if (kthread_is_per_cpu(p))
2182 return cpu_online(cpu);
2184 /* Regular kernel threads don't get to stay during offline. */
2188 /* But are allowed during online. */
2189 return cpu_online(cpu);
2193 * This is how migration works:
2195 * 1) we invoke migration_cpu_stop() on the target CPU using
2197 * 2) stopper starts to run (implicitly forcing the migrated thread
2199 * 3) it checks whether the migrated task is still in the wrong runqueue.
2200 * 4) if it's in the wrong runqueue then the migration thread removes
2201 * it and puts it into the right queue.
2202 * 5) stopper completes and stop_one_cpu() returns and the migration
2207 * move_queued_task - move a queued task to new rq.
2209 * Returns (locked) new rq. Old rq's lock is released.
/*
 * move_queued_task() - move the queued task @p from @rq to @new_cpu's rq.
 *
 * Requires @rq's lock (lockdep_assert_rq_held()). The task is deactivated
 * without a clock update (DEQUEUE_NOCLOCK), its CPU is switched, and it is
 * activated on cpu_rq(new_cpu) followed by a preemption check there.
 * NOTE(review): the lock hand-over between the two runqueues and the
 * return are missing from this extract -- confirm.
 */
2211 static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
2212 struct task_struct *p, int new_cpu)
2214 lockdep_assert_rq_held(rq);
2216 deactivate_task(rq, p, DEQUEUE_NOCLOCK);
2217 set_task_cpu(p, new_cpu);
2220 rq = cpu_rq(new_cpu);
2223 BUG_ON(task_cpu(p) != new_cpu);
2224 activate_task(rq, p, 0);
2225 check_preempt_curr(rq, p, 0);
/*
 * Argument handed to migration_cpu_stop(): the task to move and, when the
 * request comes from affine_move_task(), the set_affinity_pending it
 * belongs to.
 * NOTE(review): a member line is missing from this extract -- other code
 * in this file reads arg->dest_cpu; confirm.
 */
2230 struct migration_arg {
2231 struct task_struct *task;
2233 struct set_affinity_pending *pending;
2237 * @refs: number of wait_for_completion()
2238 * @stop_pending: is @stop_work in use
/*
 * Book-keeping for an affinity change that has to wait for the target
 * task: waiters block on @done, @stop_work/@arg are used to queue
 * migration_cpu_stop(), and @stop_pending tells whether that work is in
 * flight.
 * NOTE(review): a member line is missing from this extract -- other code
 * in this file uses pending->refs as a refcount; confirm.
 */
2240 struct set_affinity_pending {
2242 unsigned int stop_pending;
2243 struct completion done;
2244 struct cpu_stop_work stop_work;
2245 struct migration_arg arg;
2249 * Move (not current) task off this CPU, onto the destination CPU. We're doing
2250 * this because either it can't run here any more (set_cpus_allowed()
2251 * away from this CPU, or CPU going down), or because we're
2252 * attempting to rebalance this task on exec (sched_exec).
2254 * So we race with normal scheduler movements, but that's OK, as long
2255 * as the task is no longer on this CPU.
/*
 * __migrate_task() - move @p to @dest_cpu if it is still allowed there.
 *
 * Re-checks is_cpu_allowed() because affinity may have changed since the
 * request was made; on success updates the rq clock and calls
 * move_queued_task().
 * NOTE(review): both return statements are missing from this extract --
 * presumably the unchanged @rq on refusal and the new rq otherwise;
 * confirm.
 */
2257 static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
2258 struct task_struct *p, int dest_cpu)
2260 /* Affinity changed (again). */
2261 if (!is_cpu_allowed(p, dest_cpu))
2264 update_rq_clock(rq);
2265 rq = move_queued_task(rq, rf, p, dest_cpu);
2271 * migration_cpu_stop - this will be executed by a highprio stopper thread
2272 * and performs thread migration by bumping thread off CPU then
2273 * 'pushing' onto another runqueue.
/*
 * migration_cpu_stop() - CPU-stopper callback that carries out a queued
 * migration request (@data is a struct migration_arg).
 *
 * With IRQs saved/disabled and p->pi_lock held it handles two cases:
 *  - the task is still on this rq: p->migration_pending is cleared and the
 *    task is either moved with __migrate_task() (when queued) or has
 *    arg->dest_cpu recorded in p->wake_cpu;
 *  - the task moved elsewhere and a pending exists: if it already sits on
 *    an allowed CPU the pending is cleared, otherwise the stopper work is
 *    re-queued on the task's current CPU.
 * Waiters are released through complete_all(&pending->done).
 * NOTE(review): many control-flow lines (rq lock acquisition, gotos,
 * labels, "else", assignments to "complete", returns) are missing from
 * this extract, so the exact conditions under which each visible
 * statement runs cannot be read off here -- confirm against the tree.
 */
2275 static int migration_cpu_stop(void *data)
2277 struct migration_arg *arg = data;
2278 struct set_affinity_pending *pending = arg->pending;
2279 struct task_struct *p = arg->task;
2280 struct rq *rq = this_rq();
2281 bool complete = false;
2285 * The original target CPU might have gone down and we might
2286 * be on another CPU but it doesn't matter.
2288 local_irq_save(rf.flags);
2290 * We need to explicitly wake pending tasks before running
2291 * __migrate_task() such that we will not miss enforcing cpus_ptr
2292 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
2294 flush_smp_call_function_from_idle();
2296 raw_spin_lock(&p->pi_lock);
2300 * If we were passed a pending, then ->stop_pending was set, thus
2301 * p->migration_pending must have remained stable.
2303 WARN_ON_ONCE(pending && pending != p->migration_pending);
2306 * If task_rq(p) != rq, it cannot be migrated here, because we're
2307 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
2308 * we're holding p->pi_lock.
2310 if (task_rq(p) == rq) {
2311 if (is_migration_disabled(p))
2315 p->migration_pending = NULL;
2318 if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask))
2322 if (task_on_rq_queued(p))
2323 rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
2325 p->wake_cpu = arg->dest_cpu;
2328 * XXX __migrate_task() can fail, at which point we might end
2329 * up running on a dodgy CPU, AFAICT this can only happen
2330 * during CPU hotplug, at which point we'll get pushed out
2331 * anyway, so it's probably not a big deal.
2334 } else if (pending) {
2336 * This happens when we get migrated between migrate_enable()'s
2337 * preempt_enable() and scheduling the stopper task. At that
2338 * point we're a regular task again and not current anymore.
2340 * A !PREEMPT kernel has a giant hole here, which makes it far
2345 * The task moved before the stopper got to run. We're holding
2346 * ->pi_lock, so the allowed mask is stable - if it got
2347 * somewhere allowed, we're done.
2349 if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
2350 p->migration_pending = NULL;
2356 * When migrate_enable() hits a rq mis-match we can't reliably
2357 * determine is_migration_disabled() and so have to chase after
2360 WARN_ON_ONCE(!pending->stop_pending);
2361 task_rq_unlock(rq, p, &rf);
2362 stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
2363 &pending->arg, &pending->stop_work);
2368 pending->stop_pending = false;
2369 task_rq_unlock(rq, p, &rf);
2372 complete_all(&pending->done);
/*
 * push_cpu_stop() - CPU-stopper callback that tries to push task @arg off
 * this CPU's runqueue.
 *
 * Takes p->pi_lock (IRQs off) and this rq's lock. A migrate-disabled task
 * only gets MDF_PUSH set in p->migration_flags; otherwise MDF_PUSH is
 * cleared and the class's optional find_lock_rq() picks a target rq. If
 * the task is still on this rq it is deactivated, re-homed to
 * lowest_rq->cpu, activated there and that CPU is asked to reschedule.
 * rq->push_busy is cleared before the locks are dropped.
 * NOTE(review): the gotos/labels, the check on lowest_rq and the trailing
 * statements after the unlocks are missing from this extract -- confirm.
 */
2377 int push_cpu_stop(void *arg)
2379 struct rq *lowest_rq = NULL, *rq = this_rq();
2380 struct task_struct *p = arg;
2382 raw_spin_lock_irq(&p->pi_lock);
2383 raw_spin_rq_lock(rq);
2385 if (task_rq(p) != rq)
2388 if (is_migration_disabled(p)) {
2389 p->migration_flags |= MDF_PUSH;
2393 p->migration_flags &= ~MDF_PUSH;
2395 if (p->sched_class->find_lock_rq)
2396 lowest_rq = p->sched_class->find_lock_rq(p, rq);
2401 // XXX validate p is still the highest prio task
2402 if (task_rq(p) == rq) {
2403 deactivate_task(rq, p, 0);
2404 set_task_cpu(p, lowest_rq->cpu);
2405 activate_task(lowest_rq, p, 0);
2406 resched_curr(lowest_rq);
2409 double_unlock_balance(rq, lowest_rq);
2412 rq->push_busy = false;
2413 raw_spin_rq_unlock(rq);
2414 raw_spin_unlock_irq(&p->pi_lock);
2421 * sched_class::set_cpus_allowed must do the below, but is not required to
2422 * actually call this function.
/*
 * set_cpus_allowed_common() - default implementation of the affinity
 * update for @p.
 *
 * With SCA_MIGRATE_ENABLE or SCA_MIGRATE_DISABLE only p->cpus_ptr is
 * redirected to @new_mask. The remaining statements copy @new_mask into
 * p->cpus_mask and refresh p->nr_cpus_allowed from its weight.
 * NOTE(review): the end of the first branch is missing from this extract
 * -- presumably a "return", so the copy only happens without those flags;
 * confirm.
 */
2424 void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
2426 if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
2427 p->cpus_ptr = new_mask;
2431 cpumask_copy(&p->cpus_mask, new_mask);
2432 p->nr_cpus_allowed = cpumask_weight(new_mask);
/*
 * __do_set_cpus_allowed() - apply @new_mask to @p through its scheduling
 * class, taking the task off and back onto its runqueue around the change.
 *
 * The visible sequence is: sample queued/running state, dequeue with
 * DEQUEUE_SAVE | DEQUEUE_NOCLOCK, put_prev_task(), the class's
 * set_cpus_allowed() callback, enqueue with ENQUEUE_RESTORE |
 * ENQUEUE_NOCLOCK, set_next_task().
 * NOTE(review): the return-type line and the conditions guarding the
 * dequeue/put_prev/enqueue/set_next calls (and the "else" separating the
 * SCHED_WARN_ON from the pi_lock assertion) are missing from this extract
 * -- presumably "if (queued)" / "if (running)"; confirm.
 */
2436 __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
2438 struct rq *rq = task_rq(p);
2439 bool queued, running;
2442 * This here violates the locking rules for affinity, since we're only
2443 * supposed to change these variables while holding both rq->lock and
2446 * HOWEVER, it magically works, because ttwu() is the only code that
2447 * accesses these variables under p->pi_lock and only does so after
2448 * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
2449 * before finish_task().
2451 * XXX do further audits, this smells like something putrid.
2453 if (flags & SCA_MIGRATE_DISABLE)
2454 SCHED_WARN_ON(!p->on_cpu);
2456 lockdep_assert_held(&p->pi_lock);
2458 queued = task_on_rq_queued(p);
2459 running = task_current(rq, p);
2463 * Because __kthread_bind() calls this on blocked tasks without
2466 lockdep_assert_rq_held(rq);
2467 dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
2470 put_prev_task(rq, p);
2472 p->sched_class->set_cpus_allowed(p, new_mask, flags);
2475 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
2477 set_next_task(rq, p);
/*
 * do_set_cpus_allowed() - set @p's allowed CPUs to @new_mask.
 *
 * Plain entry point: forwards to __do_set_cpus_allowed() with no SCA_*
 * flags set.
 */
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
	__do_set_cpus_allowed(p, new_mask, 0);
}
2486 * This function is wildly self concurrent; here be dragons.
2489 * When given a valid mask, __set_cpus_allowed_ptr() must block until the
2490 * designated task is enqueued on an allowed CPU. If that task is currently
2491 * running, we have to kick it out using the CPU stopper.
2493 * Migrate-Disable comes along and tramples all over our nice sandcastle.
2496 * Initial conditions: P0->cpus_mask = [0, 1]
2500 * migrate_disable();
2502 * set_cpus_allowed_ptr(P0, [1]);
2504 * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes
2505 * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region).
2506 * This means we need the following scheme:
2510 * migrate_disable();
2512 * set_cpus_allowed_ptr(P0, [1]);
2516 * __set_cpus_allowed_ptr();
2517 * <wakes local stopper>
2518 * `--> <woken on migration completion>
2520 * Now the fun stuff: there may be several P1-like tasks, i.e. multiple
2521 * concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any
2522 * task p are serialized by p->pi_lock, which we can leverage: the one that
2523 * should come into effect at the end of the Migrate-Disable region is the last
2524 * one. This means we only need to track a single cpumask (i.e. p->cpus_mask),
2525 * but we still need to properly signal those waiting tasks at the appropriate
2528 * This is implemented using struct set_affinity_pending. The first
2529 * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will
2530 * setup an instance of that struct and install it on the targeted task_struct.
2531 * Any and all further callers will reuse that instance. Those then wait for
2532 * a completion signaled at the tail of the CPU stopper callback (1), triggered
2533 * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()).
2536 * (1) In the cases covered above. There is one more where the completion is
2537 * signaled within affine_move_task() itself: when a subsequent affinity request
2538 * occurs after the stopper bailed out due to the targeted task still being
2539 * Migrate-Disable. Consider:
2541 * Initial conditions: P0->cpus_mask = [0, 1]
2545 * migrate_disable();
2547 * set_cpus_allowed_ptr(P0, [1]);
2550 * migration_cpu_stop()
2551 * is_migration_disabled()
2553 * set_cpus_allowed_ptr(P0, [0, 1]);
2554 * <signal completion>
2557 * Note that the above is safe vs a concurrent migrate_enable(), as any
2558 * pending affinity completion is preceded by an uninstallation of
2559 * p->migration_pending done with p->pi_lock held.
/*
 * affine_move_task() - make sure @p ends up on a CPU allowed by its (just
 * updated) affinity, waiting for that to happen where required.
 *
 * Visible structure:
 *  1. If the task's current CPU is already in p->cpus_mask: optionally
 *     queue push_cpu_stop() (SCA_MIGRATE_ENABLE with MDF_PUSH set and no
 *     push in flight), complete any pending request that has no stopper
 *     work in flight, and unlock.
 *  2. Without SCA_MIGRATE_ENABLE: install the on-stack my_pending as
 *     p->migration_pending, or take a reference on the existing one and
 *     update its dest_cpu.
 *  3. If the task is running or in TASK_WAKING: mark stop_pending and
 *     queue migration_cpu_stop() unless it is already queued.
 *  4. Otherwise move a queued task directly with move_queued_task() and
 *     complete the pending if no stopper work is outstanding.
 *  5. Wait for pending->done, drop the reference, and make the owner of
 *     my_pending wait until all other references are gone, since it lives
 *     on this stack frame.
 * NOTE(review): many control-flow lines (else branches, returns,
 * assignments to "complete", braces) are missing from this extract, so
 * the exact nesting of the steps above cannot be read off here -- confirm
 * against the tree.
 */
2561 static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
2562 int dest_cpu, unsigned int flags)
2564 struct set_affinity_pending my_pending = { }, *pending = NULL;
2565 bool stop_pending, complete = false;
2567 /* Can the task run on the task's current CPU? If so, we're done */
2568 if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
2569 struct task_struct *push_task = NULL;
2571 if ((flags & SCA_MIGRATE_ENABLE) &&
2572 (p->migration_flags & MDF_PUSH) && !rq->push_busy) {
2573 rq->push_busy = true;
2574 push_task = get_task_struct(p);
2578 * If there are pending waiters, but no pending stop_work,
2579 * then complete now.
2581 pending = p->migration_pending;
2582 if (pending && !pending->stop_pending) {
2583 p->migration_pending = NULL;
2587 task_rq_unlock(rq, p, rf);
2590 stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
2595 complete_all(&pending->done);
2600 if (!(flags & SCA_MIGRATE_ENABLE)) {
2601 /* serialized by p->pi_lock */
2602 if (!p->migration_pending) {
2603 /* Install the request */
2604 refcount_set(&my_pending.refs, 1);
2605 init_completion(&my_pending.done);
2606 my_pending.arg = (struct migration_arg) {
2608 .dest_cpu = dest_cpu,
2609 .pending = &my_pending,
2612 p->migration_pending = &my_pending;
2614 pending = p->migration_pending;
2615 refcount_inc(&pending->refs);
2617 * Affinity has changed, but we've already installed a
2618 * pending. migration_cpu_stop() *must* see this, else
2619 * we risk a completion of the pending despite having a
2620 * task on a disallowed CPU.
2622 * Serialized by p->pi_lock, so this is safe.
2624 pending->arg.dest_cpu = dest_cpu;
2627 pending = p->migration_pending;
2629 * - !MIGRATE_ENABLE:
2630 * we'll have installed a pending if there wasn't one already.
2633 * we're here because the current CPU isn't matching anymore,
2634 * the only way that can happen is because of a concurrent
2635 * set_cpus_allowed_ptr() call, which should then still be
2636 * pending completion.
2638 * Either way, we really should have a @pending here.
2640 if (WARN_ON_ONCE(!pending)) {
2641 task_rq_unlock(rq, p, rf);
2645 if (task_running(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) {
2647 * MIGRATE_ENABLE gets here because 'p == current', but for
2648 * anything else we cannot do is_migration_disabled(), punt
2649 * and have the stopper function handle it all race-free.
2651 stop_pending = pending->stop_pending;
2653 pending->stop_pending = true;
2655 if (flags & SCA_MIGRATE_ENABLE)
2656 p->migration_flags &= ~MDF_PUSH;
2658 task_rq_unlock(rq, p, rf);
2660 if (!stop_pending) {
2661 stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
2662 &pending->arg, &pending->stop_work);
2665 if (flags & SCA_MIGRATE_ENABLE)
2669 if (!is_migration_disabled(p)) {
2670 if (task_on_rq_queued(p))
2671 rq = move_queued_task(rq, rf, p, dest_cpu);
2673 if (!pending->stop_pending) {
2674 p->migration_pending = NULL;
2678 task_rq_unlock(rq, p, rf);
2681 complete_all(&pending->done);
2684 wait_for_completion(&pending->done);
2686 if (refcount_dec_and_test(&pending->refs))
2687 wake_up_var(&pending->refs); /* No UaF, just an address */
2690 * Block the original owner of &pending until all subsequent callers
2691 * have seen the completion and decremented the refcount
2693 wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
2696 WARN_ON_ONCE(my_pending.stop_pending);
2702 * Change a given task's CPU affinity. Migrate the thread to a
2703 * proper CPU and schedule it away if the CPU it's executing on
2704 * is removed from the allowed bitmask.
2706 * NOTE: the caller must have a valid reference to the task, the
2707 * task must not exit() & deallocate itself prematurely. The
2708 * call is not atomic; no spinlocks may be held.
/*
 * Implementation outline (from the statements shown):
 *  1. Lock the task's runqueue with task_rq_lock() and refresh its clock.
 *  2. Validate against cpu_active_mask by default, or cpu_online_mask for
 *     kernel threads and migration-disabled tasks.
 *  3. Reject or short-circuit on SCA_CHECK + PF_NO_SETAFFINITY, on an
 *     unchanged mask, and on a migration-disabled current task whose CPU is
 *     not in @new_mask.
 *  4. Pick dest_cpu from the intersection of the valid mask and @new_mask,
 *     apply the mask with __do_set_cpus_allowed(), and hand over to
 *     affine_move_task(), which receives the locked rq and &rf.
 *
 * NOTE(review): the declaration of the third parameter (used below as
 * 'flags'), the locals rq/rf, the error codes and the goto/label lines are
 * not present in this listing; the final task_rq_unlock() is presumably the
 * common exit for the early-out paths - confirm against the full file.
 */
2710 static int __set_cpus_allowed_ptr(struct task_struct *p,
2711 const struct cpumask *new_mask,
2714 const struct cpumask *cpu_valid_mask = cpu_active_mask;
2715 unsigned int dest_cpu;
2720 rq = task_rq_lock(p, &rf);
2721 update_rq_clock(rq);
2723 if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
2725 * Kernel threads are allowed on online && !active CPUs,
2726 * however, during cpu-hot-unplug, even these might get pushed
2727 * away if not KTHREAD_IS_PER_CPU.
2729 * Specifically, migration_disabled() tasks must not fail the
2730 * cpumask_any_and_distribute() pick below, esp. so on
2731 * SCA_MIGRATE_ENABLE, otherwise we'll not call
2732 * set_cpus_allowed_common() and actually reset p->cpus_ptr.
2734 cpu_valid_mask = cpu_online_mask;
2738 * Must re-check here, to close a race against __kthread_bind(),
2739 * sched_setaffinity() is not guaranteed to observe the flag.
2741 if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
/* The checks in this branch are skipped for SCA_MIGRATE_ENABLE requests. */
2746 if (!(flags & SCA_MIGRATE_ENABLE)) {
2747 if (cpumask_equal(&p->cpus_mask, new_mask))
2750 if (WARN_ON_ONCE(p == current &&
2751 is_migration_disabled(p) &&
2752 !cpumask_test_cpu(task_cpu(p), new_mask))) {
2759 * Picking a ~random cpu helps in cases where we are changing affinity
2760 * for groups of tasks (ie. cpuset), so that load balancing is not
2761 * immediately required to distribute the tasks within their new mask.
2763 dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
/* dest_cpu >= nr_cpu_ids means the two masks have no CPU in common. */
2764 if (dest_cpu >= nr_cpu_ids) {
2769 __do_set_cpus_allowed(p, new_mask, flags);
2771 return affine_move_task(rq, p, &rf, dest_cpu, flags);
2774 task_rq_unlock(rq, p, &rf);
/*
 * set_cpus_allowed_ptr() - exported entry point for changing @p's affinity.
 *
 * Forwards to __set_cpus_allowed_ptr() with a flags value of 0, i.e. with
 * neither SCA_CHECK nor SCA_MIGRATE_ENABLE set, and returns its result.
 */
2779 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
2781 return __set_cpus_allowed_ptr(p, new_mask, 0);
2783 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
/*
 * set_task_cpu() - record that @p now belongs to @new_cpu.
 *
 * With CONFIG_SCHED_DEBUG a series of WARN_ON_ONCE() sanity checks runs
 * first (task state, locking, target CPU online, task not
 * migration-disabled). The migration tracepoint then fires. If the CPU
 * really changes, the scheduling class's optional migrate_task_rq() hook is
 * called, p->se.nr_migrations is bumped and perf is notified. Finally
 * __set_task_cpu() performs the actual update.
 *
 * NOTE(review): the #endif lines matching the two #ifdef lines below are
 * not present in this listing; which checks fall under CONFIG_LOCKDEP only
 * should be confirmed against the full file.
 */
2785 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2787 #ifdef CONFIG_SCHED_DEBUG
2788 unsigned int state = READ_ONCE(p->__state);
2791 * We should never call set_task_cpu() on a blocked task,
2792 * ttwu() will sort out the placement.
2794 WARN_ON_ONCE(state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq);
2797 * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
2798 * because schedstat_wait_{start,end} rebase migrating task's wait_start
2799 * time relying on p->on_rq.
2801 WARN_ON_ONCE(state == TASK_RUNNING &&
2802 p->sched_class == &fair_sched_class &&
2803 (p->on_rq && !task_on_rq_migrating(p)));
2805 #ifdef CONFIG_LOCKDEP
2807 * The caller should hold either p->pi_lock or rq->lock, when changing
2808 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
2810 * sched_move_task() holds both and thus holding either pins the cgroup,
2813 * Furthermore, all task_rq users should acquire both locks, see
2816 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2817 lockdep_is_held(__rq_lockp(task_rq(p)))));
2820 * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
2822 WARN_ON_ONCE(!cpu_online(new_cpu));
2824 WARN_ON_ONCE(is_migration_disabled(p));
/* The tracepoint fires on every call, before the "did the CPU change?" test. */
2827 trace_sched_migrate_task(p, new_cpu);
2829 if (task_cpu(p) != new_cpu) {
/* migrate_task_rq is an optional per-class hook, hence the NULL test. */
2830 if (p->sched_class->migrate_task_rq)
2831 p->sched_class->migrate_task_rq(p, new_cpu);
2832 p->se.nr_migrations++;
2834 perf_event_task_migrate(p);
2837 __set_task_cpu(p, new_cpu);
2840 #ifdef CONFIG_NUMA_BALANCING
/*
 * __migrate_swap_task() - move one task of a swap pair onto @cpu.
 *
 * For a task that is queued on a runqueue: dequeue it from its current rq,
 * retarget it with set_task_cpu(), enqueue it on @cpu's rq and run the
 * preemption check there. Both rqs are pinned with rq_pin_lock() for the
 * duration and unpinned in reverse order; this function does not take the
 * rq locks itself.
 *
 * NOTE(review): the statement for the not-queued case is not present in
 * this listing (only its comment is); confirm against the full file.
 */
2841 static void __migrate_swap_task(struct task_struct *p, int cpu)
2843 if (task_on_rq_queued(p)) {
2844 struct rq *src_rq, *dst_rq;
2845 struct rq_flags srf, drf;
2847 src_rq = task_rq(p);
2848 dst_rq = cpu_rq(cpu);
2850 rq_pin_lock(src_rq, &srf);
2851 rq_pin_lock(dst_rq, &drf);
2853 deactivate_task(src_rq, p, 0);
2854 set_task_cpu(p, cpu);
2855 activate_task(dst_rq, p, 0);
2856 check_preempt_curr(dst_rq, p, 0);
2858 rq_unpin_lock(dst_rq, &drf);
2859 rq_unpin_lock(src_rq, &srf);
2863 * Task isn't running anymore; make it appear like we migrated
2864 * it before it went to sleep. This means on wakeup we make the
2865 * previous CPU our target instead of where it really is.
/*
 * Argument bundle for migrate_swap_stop(): the two tasks to exchange and
 * the CPU each of them is expected to be on when the swap runs.
 */
2871 struct migration_swap_arg {
2872 struct task_struct *src_task, *dst_task;
2873 int src_cpu, dst_cpu;
/*
 * migrate_swap_stop() - stopper callback that exchanges two tasks.
 *
 * @data is a struct migration_swap_arg. The function gives up early when
 * either CPU is no longer active. Otherwise it takes both tasks' pi_locks
 * and both runqueue locks, re-validates that each task is still on the CPU
 * recorded in the argument and is allowed on the other CPU, and then calls
 * __migrate_swap_task() once per task. Locks are released in the reverse
 * order of acquisition.
 *
 * NOTE(review): the return statements, goto targets and labels are not
 * present in this listing; the return value of each path must be confirmed
 * against the full file.
 */
2876 static int migrate_swap_stop(void *data)
2878 struct migration_swap_arg *arg = data;
2879 struct rq *src_rq, *dst_rq;
2882 if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
2885 src_rq = cpu_rq(arg->src_cpu);
2886 dst_rq = cpu_rq(arg->dst_cpu);
2888 double_raw_lock(&arg->src_task->pi_lock,
2889 &arg->dst_task->pi_lock);
2890 double_rq_lock(src_rq, dst_rq);
/* Re-check, now under the locks, the conditions migrate_swap() tested locklessly. */
2892 if (task_cpu(arg->dst_task) != arg->dst_cpu)
2895 if (task_cpu(arg->src_task) != arg->src_cpu)
2898 if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
2901 if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
2904 __migrate_swap_task(arg->src_task, arg->dst_cpu);
2905 __migrate_swap_task(arg->dst_task, arg->src_cpu);
2910 double_rq_unlock(src_rq, dst_rq);
2911 raw_spin_unlock(&arg->dst_task->pi_lock);
2912 raw_spin_unlock(&arg->src_task->pi_lock);
2918 * Cross migrate two tasks
/*
 * migrate_swap() - request that two tasks trade CPUs.
 *
 * Builds a migration_swap_arg with @curr_cpu as the source CPU and
 * @target_cpu as the destination CPU, performs cheap lockless sanity checks
 * (different CPUs, both CPUs active, each task allowed on the other CPU),
 * emits the swap tracepoint and then runs migrate_swap_stop() on both CPUs
 * through stop_two_cpus().
 *
 * NOTE(review): the initialisers for .src_task/.dst_task, the declaration
 * of 'ret' and the goto/return lines are not present in this listing;
 * presumably @cur is the source task and @p the destination task - confirm.
 */
2920 int migrate_swap(struct task_struct *cur, struct task_struct *p,
2921 int target_cpu, int curr_cpu)
2923 struct migration_swap_arg arg;
2926 arg = (struct migration_swap_arg){
2928 .src_cpu = curr_cpu,
2930 .dst_cpu = target_cpu,
2933 if (arg.src_cpu == arg.dst_cpu)
2937 * These three tests are all lockless; this is OK since all of them
2938 * will be re-checked with proper locks held further down the line.
2940 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
2943 if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
2946 if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
2949 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
2950 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
2955 #endif /* CONFIG_NUMA_BALANCING */
2958 * wait_task_inactive - wait for a thread to unschedule.
2960 * If @match_state is nonzero, it's the @p->__state value just checked and
2961 * not expected to change. If it changes, i.e. @p might have woken up,
2962 * then return zero. When we succeed in waiting for @p to be off its CPU,
2963 * we return a positive number (its total switch count). If a second call
2964 * a short while later returns the same number, the caller can be sure that
2965 * @p has remained unscheduled the whole time.
2967 * The caller must ensure that the task *will* unschedule sometime soon,
2968 * else this function might spin for a *long* time. This function can't
2969 * be called with interrupts off, or it may introduce deadlock with
2970 * smp_call_function() if an IPI is sent by the same process we are
2971 * waiting to become inactive.
/*
 * Implementation outline (from the statements shown):
 *  1. Without any lock, spin while task_running() reports @p as running,
 *     bailing out if @match_state is set and p->__state no longer matches.
 *  2. Take the task's rq lock, sample 'running', 'queued' and, if the state
 *     still matches, the switch count, then drop the lock.
 *  3. A zero ncsw means the state changed; 'running' means step 1 raced;
 *     'queued' means @p is runnable but preempted, so sleep and retry.
 *
 * NOTE(review): the loop construct, the declarations of rq/rf/ncsw, and the
 * return/continue statements are not present in this listing.
 */
2973 unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
2975 int running, queued;
2982 * We do the initial early heuristics without holding
2983 * any task-queue locks at all. We'll only try to get
2984 * the runqueue lock when things look like they will
2990 * If the task is actively running on another CPU
2991 * still, just relax and busy-wait without holding
2994 * NOTE! Since we don't hold any locks, it's not
2995 * even sure that "rq" stays as the right runqueue!
2996 * But we don't care, since "task_running()" will
2997 * return false if the runqueue has changed and p
2998 * is actually now running somewhere else!
3000 while (task_running(rq, p)) {
3001 if (match_state && unlikely(READ_ONCE(p->__state) != match_state))
3007 * Ok, time to look more closely! We need the rq
3008 * lock now, to be *sure*. If we're wrong, we'll
3009 * just go back and repeat.
3011 rq = task_rq_lock(p, &rf);
3012 trace_sched_wait_task(p);
3013 running = task_running(rq, p);
3014 queued = task_on_rq_queued(p);
/*
 * OR-ing in LONG_MIN makes the stored count non-zero even when nvcsw is 0,
 * so the !ncsw test below can only be true when this assignment was skipped
 * (presumably ncsw is reset to 0 beforehand - that line is not shown).
 */
3016 if (!match_state || READ_ONCE(p->__state) == match_state)
3017 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
3018 task_rq_unlock(rq, p, &rf);
3021 * If it changed from the expected state, bail out now.
3023 if (unlikely(!ncsw))
3027 * Was it really running after all now that we
3028 * checked with the proper locks actually held?
3030 * Oops. Go back and try again..
3032 if (unlikely(running)) {
3038 * It's not enough that it's not actively running,
3039 * it must be off the runqueue _entirely_, and not
3042 * So if it was still runnable (but just not actively
3043 * running right now), it's preempted, and we should
3044 * yield - it could be a while.
3046 if (unlikely(queued)) {
/* Sleep uninterruptibly for NSEC_PER_SEC / HZ, i.e. the length of one tick. */
3047 ktime_t to = NSEC_PER_SEC / HZ;
3049 set_current_state(TASK_UNINTERRUPTIBLE);
3050 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
3055 * Ahh, all good. It wasn't running, and it wasn't
3056 * runnable, which means that it will never become
3057 * running in the future either. We're all done!
3066 * kick_process - kick a running thread to enter/exit the kernel
3067 * @p: the to-be-kicked thread
3069 * Cause a process which is running on another CPU to enter
3070 * kernel-mode, without any delay. (to get signals handled.)
3072 * NOTE: this function doesn't have to take the runqueue lock,
3073 * because all it wants to ensure is that the remote task enters
3074 * the kernel. If the IPI races and the task has been migrated
3075 * to another CPU then no harm is done and the purpose has been
/*
 * Sends a reschedule IPI to 'cpu' when that CPU is not the one executing
 * this code and @p is the task currently running there (task_curr()).
 *
 * NOTE(review): the declaration and assignment of 'cpu' are not present in
 * this listing - presumably it is task_cpu(p) sampled with preemption
 * disabled; confirm against the full file.
 */
3078 void kick_process(struct task_struct *p)
3084 if ((cpu != smp_processor_id()) && task_curr(p))
3085 smp_send_reschedule(cpu);
3088 EXPORT_SYMBOL_GPL(kick_process);
3091 * ->cpus_ptr is protected by both rq->lock and p->pi_lock
3093 * A few notes on cpu_active vs cpu_online:
3095 * - cpu_active must be a subset of cpu_online
3097 * - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
3098 * see __set_cpus_allowed_ptr(). At this point the newly online
3099 * CPU isn't yet part of the sched domains, and balancing will not
3102 * - on CPU-down we clear cpu_active() to mask the sched domains and
3103 * prevent the load balancer from placing new tasks on the to-be-removed
3104 * CPU. Existing tasks will remain running there and will be taken
3107 * This means that fallback selection must not select !active CPUs.
3108 * And can assume that any active CPU must be online. Conversely
3109 * select_task_rq() below may allow selection of !active CPUs in order
3110 * to satisfy the above rules.
/*
 * select_fallback_rq() - find some CPU @p may run on when @cpu is unusable.
 *
 * Search order visible below:
 *  1. an active CPU in the NUMA node of @cpu that is in p->cpus_ptr;
 *  2. any CPU in p->cpus_ptr that passes is_cpu_allowed();
 *  3. widen the affinity: first via cpuset_cpus_allowed_fallback() (when
 *     CONFIG_CPUSETS is enabled), then by resetting it to cpu_possible_mask.
 * 'state' tracks how far step 3 has escalated (cpuset -> possible -> fail).
 * If the affinity had to be widened beyond the cpuset step and the task has
 * an mm, a rate-limited message is printed.
 *
 * NOTE(review): the enclosing retry loop, the switch on 'state', the nid
 * check, the declaration of dest_cpu and the return statements are not
 * present in this listing.
 */
3112 static int select_fallback_rq(int cpu, struct task_struct *p)
3114 int nid = cpu_to_node(cpu);
3115 const struct cpumask *nodemask = NULL;
3116 enum { cpuset, possible, fail } state = cpuset;
3120 * If the node that the CPU is on has been offlined, cpu_to_node()
3121 * will return -1. There is no CPU on the node, and we should
3122 * select the CPU on the other node.
3125 nodemask = cpumask_of_node(nid);
3127 /* Look for an allowed, active CPU in the same node. */
3128 for_each_cpu(dest_cpu, nodemask) {
3129 if (!cpu_active(dest_cpu))
3131 if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
3137 /* Any allowed, online CPU? */
3138 for_each_cpu(dest_cpu, p->cpus_ptr) {
3139 if (!is_cpu_allowed(p, dest_cpu))
3145 /* No more Mr. Nice Guy. */
3148 if (IS_ENABLED(CONFIG_CPUSETS)) {
3149 cpuset_cpus_allowed_fallback(p);
3156 * XXX When called from select_task_rq() we only
3157 * hold p->pi_lock and again violate locking order.
3159 * More yuck to audit.
3161 do_set_cpus_allowed(p, cpu_possible_mask);
/* state != cpuset means the first two searches failed at least once and the affinity was widened. */
3172 if (state != cpuset) {
3174 * Don't tell them about moving exiting tasks or
3175 * kernel threads (both mm NULL), since they never
3178 if (p->mm && printk_ratelimit()) {
3179 printk_deferred("process %d (%s) no longer affine to cpu%d\n",
3180 task_pid_nr(p), p->comm, cpu);
3188 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
/*
 * select_task_rq() - choose the CPU on which @p should be placed.
 *
 * Requires p->pi_lock (asserted via lockdep). A task with more than one
 * allowed CPU that is not migration-disabled is placed by its scheduling
 * class's select_task_rq() hook; the cpumask_any(p->cpus_ptr) assignment
 * handles the remaining case. Whatever was chosen is then validated with
 * is_cpu_allowed() and replaced by select_fallback_rq() when it is not
 * usable.
 *
 * NOTE(review): the 'else' and 'return' lines (and any qualifiers preceding
 * the return type) are not present in this listing. The bracketed remark in
 * the comment below says "->select_task()", whereas the hook called here is
 * ->select_task_rq().
 */
3191 int select_task_rq(struct task_struct *p, int cpu, int wake_flags)
3193 lockdep_assert_held(&p->pi_lock);
3195 if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
3196 cpu = p->sched_class->select_task_rq(p, cpu, wake_flags);
3198 cpu = cpumask_any(p->cpus_ptr);
3201 * In order not to call set_task_cpu() on a blocking task we need
3202 * to rely on ttwu() to place the task on a valid ->cpus_ptr
3205 * Since this is common to all placement strategies, this lives here.
3207 * [ this allows ->select_task() to simply return task_cpu(p) and
3208 * not worry about this generic constraint ]
3210 if (unlikely(!is_cpu_allowed(p, cpu)))
3211 cpu = select_fallback_rq(task_cpu(p), p);
/*
 * sched_set_stop_task() - install @stop as the stop task of @cpu's runqueue.
 *
 * The new task is first given SCHED_FIFO parameters with priority
 * MAX_RT_PRIO - 1, then switched to stop_sched_class, and its pi_lock is
 * placed in a dedicated lockdep class. cpu_rq(cpu)->stop is then updated and
 * the previously installed stop task is put back into rt_sched_class.
 *
 * NOTE(review): the NULL checks guarding the @stop and old_stop sections and
 * the brace-only lines are not present in this listing; confirm against the
 * full file.
 */
3216 void sched_set_stop_task(int cpu, struct task_struct *stop)
3218 static struct lock_class_key stop_pi_lock;
3219 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
3220 struct task_struct *old_stop = cpu_rq(cpu)->stop;
3224 * Make it appear like a SCHED_FIFO task, it's something
3225 * userspace knows about and won't get confused about.
3227 * Also, it will make PI more or less work without too
3228 * much confusion -- but then, stop work should not
3229 * rely on PI working anyway.
/* Pass the address of the local 'param'; the text here had been mis-decoded from "&param". */
3231 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
3233 stop->sched_class = &stop_sched_class;
3236 * The PI code calls rt_mutex_setprio() with ->pi_lock held to
3237 * adjust the effective priority of a task. As a result,
3238 * rt_mutex_setprio() can trigger (RT) balancing operations,
3239 * which can then trigger wakeups of the stop thread to push
3240 * around the current task.
3242 * The stop task itself will never be part of the PI-chain, it
3243 * never blocks, therefore that ->pi_lock recursion is safe.
3244 * Tell lockdep about this by placing the stop->pi_lock in its
3247 lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
3250 cpu_rq(cpu)->stop = stop;
3254 * Reset it back to a normal scheduling class so that
3255 * it can die in pieces.
3257 old_stop->sched_class = &rt_sched_class;
3261 #else /* CONFIG_SMP */
/*
 * !CONFIG_SMP variant: simply forwards @p and @new_mask to
 * set_cpus_allowed_ptr(); no further argument is passed on.
 * NOTE(review): the third parameter's declaration is not present in this
 * listing.
 */
3263 static inline int __set_cpus_allowed_ptr(struct task_struct *p,
3264 const struct cpumask *new_mask,
3267 return set_cpus_allowed_ptr(p, new_mask);
/* !CONFIG_SMP variant: empty body, nothing to do. */
3270 static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
/* !CONFIG_SMP variant. NOTE(review): the body is not present in this listing; confirm the returned value against the full file. */
3272 static inline bool rq_has_pinned_tasks(struct rq *rq)
3277 #endif /* !CONFIG_SMP */
/*
 * ttwu_stat() - account a wakeup of @p towards @cpu in the schedstats.
 *
 * Does nothing unless schedstat_enabled(). A wakeup whose target @cpu equals
 * rq->cpu is counted as local; otherwise it is counted as remote and
 * additionally charged to a sched domain of rq->cpu whose span contains
 * @cpu. WF_MIGRATED and WF_SYNC wakeups get their own counters, and the
 * total counters on the rq and the task are always incremented.
 *
 * NOTE(review): the return type line, the declaration/assignment of 'rq',
 * the #ifdef matching the #endif below, and the else/break lines are not
 * present in this listing.
 */
3280 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
3284 if (!schedstat_enabled())
3290 if (cpu == rq->cpu) {
3291 __schedstat_inc(rq->ttwu_local);
3292 __schedstat_inc(p->se.statistics.nr_wakeups_local);
3294 struct sched_domain *sd;
3296 __schedstat_inc(p->se.statistics.nr_wakeups_remote);
/* Walk the sched domains of rq->cpu looking for one that also spans the target CPU. */
3298 for_each_domain(rq->cpu, sd) {
3299 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
3300 __schedstat_inc(sd->ttwu_wake_remote);
3307 if (wake_flags & WF_MIGRATED)
3308 __schedstat_inc(p->se.statistics.nr_wakeups_migrate);
3309 #endif /* CONFIG_SMP */
3311 __schedstat_inc(rq->ttwu_count);
3312 __schedstat_inc(p->se.statistics.nr_wakeups);
3314 if (wake_flags & WF_SYNC)
3315 __schedstat_inc(p->se.statistics.nr_wakeups_sync);
3319 * Mark the task runnable and perform wakeup-preemption.
/*
 * ttwu_do_wakeup() - final step of a wakeup on @rq.
 *
 * Runs the wakeup-preemption check, publishes TASK_RUNNING in p->__state,
 * fires the wakeup tracepoint, and calls the class's optional task_woken()
 * hook with the rq lock unpinned around the call. If the rq has an
 * idle_stamp, the elapsed idle time is folded into rq->avg_idle and the
 * wake_stamp/wake_avg_idle fields are refreshed.
 *
 * NOTE(review): any #ifdef around the task_woken/idle_stamp section and the
 * statements that clamp avg_idle and reset idle_stamp are not present in
 * this listing.
 */
3321 static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
3322 struct rq_flags *rf)
3324 check_preempt_curr(rq, p, wake_flags);
3325 WRITE_ONCE(p->__state, TASK_RUNNING);
3326 trace_sched_wakeup(p);
3329 if (p->sched_class->task_woken) {
3331 * Our task @p is fully woken up and running; so it's safe to
3332 * drop the rq->lock, hereafter rq is only used for statistics.
3334 rq_unpin_lock(rq, rf);
3335 p->sched_class->task_woken(rq, p);
3336 rq_repin_lock(rq, rf);
3339 if (rq->idle_stamp) {
/* delta: time since idle_stamp; max: cap of twice the max idle-balance cost. */
3340 u64 delta = rq_clock(rq) - rq->idle_stamp;
3341 u64 max = 2*rq->max_idle_balance_cost;
3343 update_avg(&rq->avg_idle, delta);
3345 if (rq->avg_idle > max)
3348 rq->wake_stamp = jiffies;
3349 rq->wake_avg_idle = rq->avg_idle / 2;
/*
 * ttwu_do_activate() - enqueue a woken task on @rq and finish the wakeup.
 *
 * Must be called with @rq locked (lockdep_assert_rq_held). Undoes the
 * nr_uninterruptible accounting if the task contributed to load, adds
 * ENQUEUE_MIGRATED for WF_MIGRATED wakeups, enqueues with activate_task()
 * using ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK, and finishes in ttwu_do_wakeup().
 *
 * NOTE(review): the return type line, any #ifdef around the WF_MIGRATED
 * test, and the condition guarding the delayacct/nr_iowait pair (presumably
 * p->in_iowait) are not present in this listing.
 */
3357 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
3358 struct rq_flags *rf)
3360 int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
3362 lockdep_assert_rq_held(rq);
3364 if (p->sched_contributes_to_load)
3365 rq->nr_uninterruptible--;
3368 if (wake_flags & WF_MIGRATED)
3369 en_flags |= ENQUEUE_MIGRATED;
/* The iowait count is dropped on task_rq(p), which need not be @rq. */
3373 delayacct_blkio_end(p);
3374 atomic_dec(&task_rq(p)->nr_iowait);
3377 activate_task(rq, p, en_flags);
3378 ttwu_do_wakeup(rq, p, wake_flags, rf);
3382 * Consider @p being inside a wait loop:
3385 * set_current_state(TASK_UNINTERRUPTIBLE);
3392 * __set_current_state(TASK_RUNNING);
3394 * between set_current_state() and schedule(). In this case @p is still
3395 * runnable, so all that needs doing is change p->state back to TASK_RUNNING in
3398 * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq
3399 * then schedule() must still happen and p->state can be changed to
3400 * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we
3401 * need to do a full wakeup with enqueue.
3403 * Returns: %true when the wakeup is done,
/*
 * Locks the task's runqueue with __task_rq_lock() and, if @p is still
 * queued there, completes the wakeup in place through ttwu_do_wakeup()
 * after refreshing the rq clock; no enqueue is needed in that case.
 *
 * NOTE(review): the declarations of rq/rf, the result variable and the
 * return statement are not present in this listing.
 */
3406 static int ttwu_runnable(struct task_struct *p, int wake_flags)
3412 rq = __task_rq_lock(p, &rf);
3413 if (task_on_rq_queued(p)) {
3414 /* check_preempt_curr() may use rq clock */
3415 update_rq_clock(rq);
3416 ttwu_do_wakeup(rq, p, wake_flags, &rf);
3419 __task_rq_unlock(rq, &rf);
/*
 * sched_ttwu_pending() - activate every task on a queued wakeup list.
 *
 * @arg is the head of an llist of tasks linked through wake_entry.llist.
 * The function clears this CPU's rq->ttwu_pending hint, takes the local rq
 * lock with interrupts saved/disabled, refreshes the rq clock, and calls
 * ttwu_do_activate() for each task. WF_MIGRATED is passed when the task was
 * flagged with sched_remote_wakeup.
 *
 * The two WARN_ON_ONCE() branches repair states that are not expected here:
 * a task still marked on_cpu (wait until it is off), and a task whose CPU
 * is not this rq's CPU (retarget it).
 *
 * NOTE(review): the declaration of rf and any early-return on an empty list
 * are not present in this listing. In the comment below, "shorter lived
 * that" is a typo for "shorter lived than".
 */
3425 void sched_ttwu_pending(void *arg)
3427 struct llist_node *llist = arg;
3428 struct rq *rq = this_rq();
3429 struct task_struct *p, *t;
3436 * rq::ttwu_pending racy indication of out-standing wakeups.
3437 * Races such that false-negatives are possible, since they
3438 * are shorter lived that false-positives would be.
3440 WRITE_ONCE(rq->ttwu_pending, 0);
3442 rq_lock_irqsave(rq, &rf);
3443 update_rq_clock(rq);
/* The _safe iterator is used with 't' as the lookahead cursor. */
3445 llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
3446 if (WARN_ON_ONCE(p->on_cpu))
3447 smp_cond_load_acquire(&p->on_cpu, !VAL);
3449 if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
3450 set_task_cpu(p, cpu_of(rq));
3452 ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
3455 rq_unlock_irqrestore(rq, &rf);
/*
 * Sends the architecture's call-function-single IPI to @cpu unless
 * set_nr_if_polling() succeeds on that CPU's idle task, in which case no
 * IPI is needed.
 *
 * NOTE(review): the 'else' line is not present in this listing; going by
 * its name, the tracepoint presumably belongs to the no-IPI branch -
 * confirm against the full file.
 */
3458 void send_call_function_single_ipi(int cpu)
3460 struct rq *rq = cpu_rq(cpu);
3462 if (!set_nr_if_polling(rq->idle))
3463 arch_send_call_function_single_ipi(cpu);
3465 trace_sched_wake_idle_without_ipi(cpu);
3469 * Queue a task on the target CPU's wake_list and wake the CPU via IPI if
3470 * necessary. The wakee CPU on receipt of the IPI will queue the task
3471 * via sched_ttwu_pending() for activation so the wakee incurs the cost
3472 * of the wakeup instead of the waker.
/*
 * Records in p->sched_remote_wakeup whether this wakeup carries
 * WF_MIGRATED, raises the target rq's ttwu_pending hint, and queues the
 * task's wake_entry on @cpu through __smp_call_single_queue().
 */
3474 static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
3476 struct rq *rq = cpu_rq(cpu);
3478 p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
/* Set the hint before queueing; sched_ttwu_pending() clears it again on the target CPU. */
3480 WRITE_ONCE(rq->ttwu_pending, 1);
3481 __smp_call_single_queue(cpu, &p->wake_entry.llist);
3484 void wake_up_if_idle(int cpu)