Merge tag 'perf-tools-for-v5.15-2021-09-04' of git://git.kernel.org/pub/scm/linux...

[linux-2.6-microblaze.git] / kernel / sched / core.c
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 8dc6716..c4462c4 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -237,9 +237,30 @@ static DEFINE_MUTEX(sched_core_mutex);
  static atomic_t sched_core_count;
  static struct cpumask sched_core_mask;
  
+static void sched_core_lock(int cpu, unsigned long *flags)
+{
+       const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+       int t, i = 0;
+
+       local_irq_save(*flags);
+       for_each_cpu(t, smt_mask)
+               raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
+}
+
+static void sched_core_unlock(int cpu, unsigned long *flags)
+{
+       const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+       int t;
+
+       for_each_cpu(t, smt_mask)
+               raw_spin_unlock(&cpu_rq(t)->__lock);
+       local_irq_restore(*flags);
+}
+
  static void __sched_core_flip(bool enabled)
  {
-       int cpu, t, i;
+       unsigned long flags;
+       int cpu, t;
  
         cpus_read_lock();
  
@@ -250,19 +271,12 @@ static void __sched_core_flip(bool enabled)
         for_each_cpu(cpu, &sched_core_mask) {
                 const struct cpumask *smt_mask = cpu_smt_mask(cpu);
  
-               i = 0;
-               local_irq_disable();
-               for_each_cpu(t, smt_mask) {
-                       /* supports up to SMT8 */
-                       raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
-               }
+               sched_core_lock(cpu, &flags);
  
                 for_each_cpu(t, smt_mask)
                         cpu_rq(t)->core_enabled = enabled;
  
-               for_each_cpu(t, smt_mask)
-                       raw_spin_unlock(&cpu_rq(t)->__lock);
-               local_irq_enable();
+               sched_core_unlock(cpu, &flags);
  
                 cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
         }
@@ -3761,6 +3775,55 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
         rq_unlock(rq, &rf);
  }
  
+/*
+ * Invoked from try_to_wake_up() to check whether the task can be woken up.
+ *
+ * The caller holds p::pi_lock if p != current or has preemption
+ * disabled when p == current.
+ *
+ * The rules of PREEMPT_RT saved_state:
+ *
+ *   The related locking code always holds p::pi_lock when updating
+ *   p::saved_state, which means the code is fully serialized in both cases.
+ *
+ *   The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other
+ *   bits are set. This allows distinguishing all wakeup scenarios.
+ */
+static __always_inline
+bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
+{
+       if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
+               WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) &&
+                            state != TASK_RTLOCK_WAIT);
+       }
+
+       if (READ_ONCE(p->__state) & state) {
+               *success = 1;
+               return true;
+       }
+
+#ifdef CONFIG_PREEMPT_RT
+       /*
+        * Saved state preserves the task state across blocking on
+        * an RT lock.  If the state matches, set p::saved_state to
+        * TASK_RUNNING, but do not wake the task because it waits
+        * for a lock wakeup. Also indicate success because from
+        * the regular waker's point of view this has succeeded.
+        *
+        * After acquiring the lock the task will restore p::__state
+        * from p::saved_state which ensures that the regular
+        * wakeup is not lost. The restore will also set
+        * p::saved_state to TASK_RUNNING so any further tests will
+        * not result in false positives vs. @success.
+        */
+       if (p->saved_state & state) {
+               p->saved_state = TASK_RUNNING;
+               *success = 1;
+       }
+#endif
+       return false;
+}
+
  /*
   * Notes on Program-Order guarantees on SMP systems.
   *
@@ -3900,10 +3963,9 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
                  *  - we're serialized against set_special_state() by virtue of
                  *    it disabling IRQs (this allows not taking ->pi_lock).
                  */
-               if (!(READ_ONCE(p->__state) & state))
+               if (!ttwu_state_match(p, state, &success))
                         goto out;
  
-               success = 1;
                 trace_sched_waking(p);
                 WRITE_ONCE(p->__state, TASK_RUNNING);
                 trace_sched_wakeup(p);
@@ -3918,14 +3980,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
          */
         raw_spin_lock_irqsave(&p->pi_lock, flags);
         smp_mb__after_spinlock();
-       if (!(READ_ONCE(p->__state) & state))
+       if (!ttwu_state_match(p, state, &success))
                 goto unlock;
  
         trace_sched_waking(p);
  
-       /* We're going to change ->state: */
-       success = 1;
-
         /*
          * Ensure we load p->on_rq _after_ p->state, otherwise it would
          * be possible to, falsely, observe p->on_rq == 0 and get stuck
@@ -5934,35 +5993,109 @@ void queue_core_balance(struct rq *rq)
         queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance);
  }
  
-static inline void sched_core_cpu_starting(unsigned int cpu)
+static void sched_core_cpu_starting(unsigned int cpu)
  {
         const struct cpumask *smt_mask = cpu_smt_mask(cpu);
-       struct rq *rq, *core_rq = NULL;
-       int i;
+       struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
+       unsigned long flags;
+       int t;
+
+       sched_core_lock(cpu, &flags);
  
-       core_rq = cpu_rq(cpu)->core;
+       WARN_ON_ONCE(rq->core != rq);
  
-       if (!core_rq) {
-               for_each_cpu(i, smt_mask) {
-                       rq = cpu_rq(i);
-                       if (rq->core && rq->core == rq)
-                               core_rq = rq;
+       /* if we're the first, we'll be our own leader */
+       if (cpumask_weight(smt_mask) == 1)
+               goto unlock;
+
+       /* find the leader */
+       for_each_cpu(t, smt_mask) {
+               if (t == cpu)
+                       continue;
+               rq = cpu_rq(t);
+               if (rq->core == rq) {
+                       core_rq = rq;
+                       break;
                 }
+       }
  
-               if (!core_rq)
-                       core_rq = cpu_rq(cpu);
+       if (WARN_ON_ONCE(!core_rq)) /* whoopsie */
+               goto unlock;
  
-               for_each_cpu(i, smt_mask) {
-                       rq = cpu_rq(i);
+       /* install and validate core_rq */
+       for_each_cpu(t, smt_mask) {
+               rq = cpu_rq(t);
  
-                       WARN_ON_ONCE(rq->core && rq->core != core_rq);
+               if (t == cpu)
                         rq->core = core_rq;
-               }
+
+               WARN_ON_ONCE(rq->core != core_rq);
         }
+
+unlock:
+       sched_core_unlock(cpu, &flags);
+}
+
+static void sched_core_cpu_deactivate(unsigned int cpu)
+{
+       const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+       struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
+       unsigned long flags;
+       int t;
+
+       sched_core_lock(cpu, &flags);
+
+       /* if we're the last man standing, nothing to do */
+       if (cpumask_weight(smt_mask) == 1) {
+               WARN_ON_ONCE(rq->core != rq);
+               goto unlock;
+       }
+
+       /* if we're not the leader, nothing to do */
+       if (rq->core != rq)
+               goto unlock;
+
+       /* find a new leader */
+       for_each_cpu(t, smt_mask) {
+               if (t == cpu)
+                       continue;
+               core_rq = cpu_rq(t);
+               break;
+       }
+
+       if (WARN_ON_ONCE(!core_rq)) /* impossible */
+               goto unlock;
+
+       /* copy the shared state to the new leader */
+       core_rq->core_task_seq      = rq->core_task_seq;
+       core_rq->core_pick_seq      = rq->core_pick_seq;
+       core_rq->core_cookie        = rq->core_cookie;
+       core_rq->core_forceidle     = rq->core_forceidle;
+       core_rq->core_forceidle_seq = rq->core_forceidle_seq;
+
+       /* install new leader */
+       for_each_cpu(t, smt_mask) {
+               rq = cpu_rq(t);
+               rq->core = core_rq;
+       }
+
+unlock:
+       sched_core_unlock(cpu, &flags);
  }
+
+static inline void sched_core_cpu_dying(unsigned int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+
+       if (rq->core != rq)
+               rq->core = rq;
+}
+
  #else /* !CONFIG_SCHED_CORE */
  
  static inline void sched_core_cpu_starting(unsigned int cpu) {}
+static inline void sched_core_cpu_deactivate(unsigned int cpu) {}
+static inline void sched_core_cpu_dying(unsigned int cpu) {}
  
  static struct task_struct *
  pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
@@ -5972,6 +6105,24 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  
  #endif /* CONFIG_SCHED_CORE */
  
+/*
+ * Constants for the sched_mode argument of __schedule().
+ *
+ * The mode argument allows RT enabled kernels to differentiate a
+ * preemption from blocking on a 'sleeping' spin/rwlock. Note that
+ * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to
+ * optimize the AND operation out and just check for zero.
+ */
+#define SM_NONE                        0x0
+#define SM_PREEMPT             0x1
+#define SM_RTLOCK_WAIT         0x2
+
+#ifndef CONFIG_PREEMPT_RT
+# define SM_MASK_PREEMPT       (~0U)
+#else
+# define SM_MASK_PREEMPT       SM_PREEMPT
+#endif
+
  /*
   * __schedule() is the main scheduler function.
   *
@@ -6011,7 +6162,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
   *
   * WARNING: must be called with preemption disabled!
   */
-static void __sched notrace __schedule(bool preempt)
+static void __sched notrace __schedule(unsigned int sched_mode)
  {
         struct task_struct *prev, *next;
         unsigned long *switch_count;
@@ -6024,13 +6175,13 @@ static void __sched notrace __schedule(bool preempt)
         rq = cpu_rq(cpu);
         prev = rq->curr;
  
-       schedule_debug(prev, preempt);
+       schedule_debug(prev, !!sched_mode);
  
         if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
                 hrtick_clear(rq);
  
         local_irq_disable();
-       rcu_note_context_switch(preempt);
+       rcu_note_context_switch(!!sched_mode);
  
         /*
          * Make sure that signal_pending_state()->signal_pending() below
@@ -6064,7 +6215,7 @@ static void __sched notrace __schedule(bool preempt)
          *  - ptrace_{,un}freeze_traced() can change ->state underneath us.
          */
         prev_state = READ_ONCE(prev->__state);
-       if (!preempt && prev_state) {
+       if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
                 if (signal_pending_state(prev_state, prev)) {
                         WRITE_ONCE(prev->__state, TASK_RUNNING);
                 } else {
@@ -6130,7 +6281,7 @@ static void __sched notrace __schedule(bool preempt)
                 migrate_disable_switch(rq, prev);
                 psi_sched_switch(prev, next, !task_on_rq_queued(prev));
  
-               trace_sched_switch(preempt, prev, next);
+               trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next);
  
                 /* Also unlocks the rq: */
                 rq = context_switch(rq, prev, next, &rf);
@@ -6151,7 +6302,7 @@ void __noreturn do_task_dead(void)
         /* Tell freezer to ignore us: */
         current->flags |= PF_NOFREEZE;
  
-       __schedule(false);
+       __schedule(SM_NONE);
         BUG();
  
         /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */
@@ -6212,7 +6363,7 @@ asmlinkage __visible void __sched schedule(void)
         sched_submit_work(tsk);
         do {
                 preempt_disable();
-               __schedule(false);
+               __schedule(SM_NONE);
                 sched_preempt_enable_no_resched();
         } while (need_resched());
         sched_update_worker(tsk);
@@ -6240,7 +6391,7 @@ void __sched schedule_idle(void)
          */
         WARN_ON_ONCE(current->__state);
         do {
-               __schedule(false);
+               __schedule(SM_NONE);
         } while (need_resched());
  }
  
@@ -6275,6 +6426,18 @@ void __sched schedule_preempt_disabled(void)
         preempt_disable();
  }
  
+#ifdef CONFIG_PREEMPT_RT
+void __sched notrace schedule_rtlock(void)
+{
+       do {
+               preempt_disable();
+               __schedule(SM_RTLOCK_WAIT);
+               sched_preempt_enable_no_resched();
+       } while (need_resched());
+}
+NOKPROBE_SYMBOL(schedule_rtlock);
+#endif
+
  static void __sched notrace preempt_schedule_common(void)
  {
         do {
@@ -6293,7 +6456,7 @@ static void __sched notrace preempt_schedule_common(void)
                  */
                 preempt_disable_notrace();
                 preempt_latency_start(1);
-               __schedule(true);
+               __schedule(SM_PREEMPT);
                 preempt_latency_stop(1);
                 preempt_enable_no_resched_notrace();
  
@@ -6372,7 +6535,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
                  * an infinite recursion.
                  */
                 prev_ctx = exception_enter();
-               __schedule(true);
+               __schedule(SM_PREEMPT);
                 exception_exit(prev_ctx);
  
                 preempt_latency_stop(1);
@@ -6521,7 +6684,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
         do {
                 preempt_disable();
                 local_irq_enable();
-               __schedule(true);
+               __schedule(SM_PREEMPT);
                 local_irq_disable();
                 sched_preempt_enable_no_resched();
         } while (need_resched());
@@ -7988,6 +8151,17 @@ int __sched __cond_resched(void)
                 preempt_schedule_common();
                 return 1;
         }
+       /*
+        * In preemptible kernels, ->rcu_read_lock_nesting tells the tick
+        * whether the current CPU is in an RCU read-side critical section,
+        * so the tick can report quiescent states even for CPUs looping
+        * in kernel context.  In contrast, in non-preemptible kernels,
+        * RCU readers leave no in-memory hints, which means that CPU-bound
+        * processes executing in kernel context might never report an
+        * RCU quiescent state.  Therefore, the following code causes
+        * cond_resched() to report a quiescent state, but only when RCU
+        * is in urgent need of one.
+        */
  #ifndef CONFIG_PREEMPT_RCU
         rcu_all_qs();
  #endif
@@ -8934,6 +9108,8 @@ int sched_cpu_deactivate(unsigned int cpu)
          */
         if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
                 static_branch_dec_cpuslocked(&sched_smt_present);
+
+       sched_core_cpu_deactivate(cpu);
  #endif
  
         if (!sched_smp_initialized)
@@ -9038,6 +9214,7 @@ int sched_cpu_dying(unsigned int cpu)
         calc_load_migrate(rq);
         update_max_interval();
         hrtick_clear(rq);
+       sched_core_cpu_dying(cpu);
         return 0;
  }
  #endif
@@ -9249,7 +9426,7 @@ void __init sched_init(void)
                 atomic_set(&rq->nr_iowait, 0);
  
  #ifdef CONFIG_SCHED_CORE
-               rq->core = NULL;
+               rq->core = rq;
                 rq->core_pick = NULL;
                 rq->core_enabled = 0;
                 rq->core_tree = RB_ROOT;