Merge tag 'v5.11' into sched/core, to pick up fixes & refresh the branch

author Ingo Molnar <mingo@kernel.org>
Wed, 17 Feb 2021 13:04:39 +0000 (14:04 +0100)
committer Ingo Molnar <mingo@kernel.org>
Wed, 17 Feb 2021 13:04:39 +0000 (14:04 +0100)
diff --combined init/Kconfig

index 058b99d,29ad683..a104696
--- 1/init/Kconfig
--- 2/init/Kconfig
+++ b/init/Kconfig
@@@ -76,7 -76,6 +76,6 @@@ config CC_HAS_ASM_INLIN
   
   config CONSTRUCTORS
         bool
-       depends on !UML
   
   config IRQ_WORK
         bool
@@@ -525,7 -524,7 +524,7 @@@ config SCHED_THERMAL_PRESSUR
           i.e. put less load on throttled CPUs than on non/less throttled ones.
   
           This requires the architecture to implement
- -        arch_set_thermal_pressure() and arch_get_thermal_pressure().
+ +        arch_set_thermal_pressure() and arch_scale_thermal_pressure().
   
   config BSD_PROCESS_ACCT
         bool "BSD Process Accounting"
diff --combined kernel/sched/core.c

index 06b4499,ff74fca..6c789dc
--- 1/kernel/sched/core.c
--- 2/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@@ -1796,13 -1796,28 +1796,28 @@@ static inline bool rq_has_pinned_tasks(
    */
   static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
   {
+       /* When not in the task's cpumask, no point in looking further. */
         if (!cpumask_test_cpu(cpu, p->cpus_ptr))
                 return false;
   
-       if (is_per_cpu_kthread(p) || is_migration_disabled(p))
+       /* migrate_disabled() must be allowed to finish. */
+       if (is_migration_disabled(p))
                 return cpu_online(cpu);
   
-       return cpu_active(cpu);
+       /* Non kernel threads are not allowed during either online or offline. */
+       if (!(p->flags & PF_KTHREAD))
+               return cpu_active(cpu);
+ 
+       /* KTHREAD_IS_PER_CPU is always allowed. */
+       if (kthread_is_per_cpu(p))
+               return cpu_online(cpu);
+ 
+       /* Regular kernel threads don't get to stay during offline. */
+       if (cpu_rq(cpu)->balance_push)
+               return false;
+ 
+       /* But are allowed during online. */
+       return cpu_online(cpu);
   }
   
   /*
@@@ -2327,7 -2342,9 +2342,9 @@@ static int __set_cpus_allowed_ptr(struc
   
         if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
                 /*
-                * Kernel threads are allowed on online && !active CPUs.
+                * Kernel threads are allowed on online && !active CPUs,
+                * however, during cpu-hot-unplug, even these might get pushed
+                * away if not KTHREAD_IS_PER_CPU.
                  *
                  * Specifically, migration_disabled() tasks must not fail the
                  * cpumask_any_and_distribute() pick below, esp. so on
@@@ -2371,16 -2388,6 +2388,6 @@@
   
         __do_set_cpus_allowed(p, new_mask, flags);
   
-       if (p->flags & PF_KTHREAD) {
-               /*
-                * For kernel threads that do indeed end up on online &&
-                * !active we want to ensure they are strict per-CPU threads.
-                */
-               WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
-                       !cpumask_intersects(new_mask, cpu_active_mask) &&
-                       p->nr_cpus_allowed != 1);
-       }
- 
         return affine_move_task(rq, p, &rf, dest_cpu, flags);
   
   out:
@@@ -3121,6 -3128,13 +3128,13 @@@ bool cpus_share_cache(int this_cpu, in
   
   static inline bool ttwu_queue_cond(int cpu, int wake_flags)
   {
+       /*
+        * Do not complicate things with the async wake_list while the CPU is
+        * in hotplug state.
+        */
+       if (!cpu_active(cpu))
+               return false;
+ 
         /*
          * If the CPU does not share cache, then queue the task on the
          * remote rqs wakelist to avoid accessing remote data.
@@@ -5662,120 -5676,6 +5676,120 @@@ struct task_struct *idle_task(int cpu
         return cpu_rq(cpu)->idle;
   }
   
+ +#ifdef CONFIG_SMP
+ +/*
+ + * This function computes an effective utilization for the given CPU, to be
+ + * used for frequency selection given the linear relation: f = u * f_max.
+ + *
+ + * The scheduler tracks the following metrics:
+ + *
+ + *   cpu_util_{cfs,rt,dl,irq}()
+ + *   cpu_bw_dl()
+ + *
+ + * Where the cfs,rt and dl util numbers are tracked with the same metric and
+ + * synchronized windows and are thus directly comparable.
+ + *
+ + * The cfs,rt,dl utilization are the running times measured with rq->clock_task
+ + * which excludes things like IRQ and steal-time. These latter are then accrued
+ + * in the irq utilization.
+ + *
+ + * The DL bandwidth number otoh is not a measured metric but a value computed
+ + * based on the task model parameters and gives the minimal utilization
+ + * required to meet deadlines.
+ + */
+ +unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
+ +                               unsigned long max, enum cpu_util_type type,
+ +                               struct task_struct *p)
+ +{
+ +      unsigned long dl_util, util, irq;
+ +      struct rq *rq = cpu_rq(cpu);
+ +
+ +      if (!uclamp_is_used() &&
+ +          type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
+ +              return max;
+ +      }
+ +
+ +      /*
+ +       * Early check to see if IRQ/steal time saturates the CPU, can be
+ +       * because of inaccuracies in how we track these -- see
+ +       * update_irq_load_avg().
+ +       */
+ +      irq = cpu_util_irq(rq);
+ +      if (unlikely(irq >= max))
+ +              return max;
+ +
+ +      /*
+ +       * Because the time spend on RT/DL tasks is visible as 'lost' time to
+ +       * CFS tasks and we use the same metric to track the effective
+ +       * utilization (PELT windows are synchronized) we can directly add them
+ +       * to obtain the CPU's actual utilization.
+ +       *
+ +       * CFS and RT utilization can be boosted or capped, depending on
+ +       * utilization clamp constraints requested by currently RUNNABLE
+ +       * tasks.
+ +       * When there are no CFS RUNNABLE tasks, clamps are released and
+ +       * frequency will be gracefully reduced with the utilization decay.
+ +       */
+ +      util = util_cfs + cpu_util_rt(rq);
+ +      if (type == FREQUENCY_UTIL)
+ +              util = uclamp_rq_util_with(rq, util, p);
+ +
+ +      dl_util = cpu_util_dl(rq);
+ +
+ +      /*
+ +       * For frequency selection we do not make cpu_util_dl() a permanent part
+ +       * of this sum because we want to use cpu_bw_dl() later on, but we need
+ +       * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
+ +       * that we select f_max when there is no idle time.
+ +       *
+ +       * NOTE: numerical errors or stop class might cause us to not quite hit
+ +       * saturation when we should -- something for later.
+ +       */
+ +      if (util + dl_util >= max)
+ +              return max;
+ +
+ +      /*
+ +       * OTOH, for energy computation we need the estimated running time, so
+ +       * include util_dl and ignore dl_bw.
+ +       */
+ +      if (type == ENERGY_UTIL)
+ +              util += dl_util;
+ +
+ +      /*
+ +       * There is still idle time; further improve the number by using the
+ +       * irq metric. Because IRQ/steal time is hidden from the task clock we
+ +       * need to scale the task numbers:
+ +       *
+ +       *              max - irq
+ +       *   U' = irq + --------- * U
+ +       *                 max
+ +       */
+ +      util = scale_irq_capacity(util, irq, max);
+ +      util += irq;
+ +
+ +      /*
+ +       * Bandwidth required by DEADLINE must always be granted while, for
+ +       * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
+ +       * to gracefully reduce the frequency when no tasks show up for longer
+ +       * periods of time.
+ +       *
+ +       * Ideally we would like to set bw_dl as min/guaranteed freq and util +
+ +       * bw_dl as requested freq. However, cpufreq is not yet ready for such
+ +       * an interface. So, we only do the latter for now.
+ +       */
+ +      if (type == FREQUENCY_UTIL)
+ +              util += cpu_bw_dl(rq);
+ +
+ +      return min(max, util);
+ +}
+ +
+ +unsigned long sched_cpu_util(int cpu, unsigned long max)
+ +{
+ +      return effective_cpu_util(cpu, cpu_util_cfs(cpu_rq(cpu)), max,
+ +                                ENERGY_UTIL, NULL);
+ +}
+ +#endif /* CONFIG_SMP */
+ +
   /**
    * find_process_by_pid - find a process with a matching PID value.
    * @pid: the pid in question.
@@@ -7390,8 -7290,14 +7404,14 @@@ static void balance_push(struct rq *rq
         /*
          * Both the cpu-hotplug and stop task are in this case and are
          * required to complete the hotplug process.
+        *
+        * XXX: the idle task does not match kthread_is_per_cpu() due to
+        * histerical raisins.
          */
-       if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) {
+       if (rq->idle == push_task ||
+           ((push_task->flags & PF_KTHREAD) && kthread_is_per_cpu(push_task)) ||
+           is_migration_disabled(push_task)) {
+ 
                 /*
                  * If this is the idle task on the outgoing CPU try to wake
                  * up the hotplug control thread which might wait for the
@@@ -7423,7 -7329,7 +7443,7 @@@
         /*
          * At this point need_resched() is true and we'll take the loop in
          * schedule(). The next pick is obviously going to be the stop task
-        * which is_per_cpu_kthread() and will push this task away.
+        * which kthread_is_per_cpu() and will push this task away.
          */
         raw_spin_lock(&rq->lock);
   }
@@@ -7434,10 -7340,13 +7454,13 @@@ static void balance_push_set(int cpu, b
         struct rq_flags rf;
   
         rq_lock_irqsave(rq, &rf);
-       if (on)
+       rq->balance_push = on;
+       if (on) {
+               WARN_ON_ONCE(rq->balance_callback);
                 rq->balance_callback = &balance_push_callback;
-       else
+       } else if (rq->balance_callback == &balance_push_callback) {
                 rq->balance_callback = NULL;
+       }
         rq_unlock_irqrestore(rq, &rf);
   }
   
@@@ -7555,6 -7464,10 +7578,10 @@@ int sched_cpu_activate(unsigned int cpu
         struct rq *rq = cpu_rq(cpu);
         struct rq_flags rf;
   
+       /*
+        * Make sure that when the hotplug state machine does a roll-back
+        * we clear balance_push. Ideally that would happen earlier...
+        */
         balance_push_set(cpu, false);
   
   #ifdef CONFIG_SCHED_SMT
@@@ -7596,24 -7509,28 +7623,34 @@@ int sched_cpu_deactivate(unsigned int c
         struct rq_flags rf;
         int ret;
   
+ +      /*
+ +       * Remove CPU from nohz.idle_cpus_mask to prevent participating in
+ +       * load balancing when not active
+ +       */
+ +      nohz_balance_exit_idle(rq);
+ +
         set_cpu_active(cpu, false);
+ 
+       /*
+        * From this point forward, this CPU will refuse to run any task that
+        * is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively
+        * push those tasks away until this gets cleared, see
+        * sched_cpu_dying().
+        */
+       balance_push_set(cpu, true);
+ 
         /*
-        * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
-        * users of this state to go away such that all new such users will
-        * observe it.
+        * We've cleared cpu_active_mask / set balance_push, wait for all
+        * preempt-disabled and RCU users of this state to go away such that
+        * all new such users will observe it.
+        *
+        * Specifically, we rely on ttwu to no longer target this CPU, see
+        * ttwu_queue_cond() and is_cpu_allowed().
          *
          * Do sync before park smpboot threads to take care the rcu boost case.
          */
         synchronize_rcu();
   
-       balance_push_set(cpu, true);
- 
         rq_lock_irqsave(rq, &rf);
         if (rq->rd) {
                 update_rq_clock(rq);
@@@ -7694,6 -7611,25 +7731,25 @@@ static void calc_load_migrate(struct r
                 atomic_long_add(delta, &calc_load_tasks);
   }
   
+ static void dump_rq_tasks(struct rq *rq, const char *loglvl)
+ {
+       struct task_struct *g, *p;
+       int cpu = cpu_of(rq);
+ 
+       lockdep_assert_held(&rq->lock);
+ 
+       printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running);
+       for_each_process_thread(g, p) {
+               if (task_cpu(p) != cpu)
+                       continue;
+ 
+               if (!task_on_rq_queued(p))
+                       continue;
+ 
+               printk("%s\tpid: %d, name: %s\n", loglvl, p->pid, p->comm);
+       }
+ }
+ 
   int sched_cpu_dying(unsigned int cpu)
   {
         struct rq *rq = cpu_rq(cpu);
@@@ -7703,11 -7639,21 +7759,20 @@@
         sched_tick_stop(cpu);
   
         rq_lock_irqsave(rq, &rf);
-       BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
+       if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) {
+               WARN(true, "Dying CPU not properly vacated!");
+               dump_rq_tasks(rq, KERN_WARNING);
+       }
         rq_unlock_irqrestore(rq, &rf);
   
+       /*
+        * Now that the CPU is offline, make sure we're welcome
+        * to new tasks once we come back up.
+        */
+       balance_push_set(cpu, false);
+ 
         calc_load_migrate(rq);
         update_max_interval();
- -      nohz_balance_exit_idle(rq);
         hrtick_clear(rq);
         return 0;
   }
diff --combined kernel/sched/sched.h

index 045b010,bb09988..f519aba
--- 1/kernel/sched/sched.h
--- 2/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@@ -975,6 -975,7 +975,7 @@@ struct rq 
         unsigned long           cpu_capacity_orig;
   
         struct callback_head    *balance_callback;
+       unsigned char           balance_push;
   
         unsigned char           nohz_idle_balance;
         unsigned char           idle_balance;
@@@ -2557,24 -2558,27 +2558,24 @@@ static inline unsigned long capacity_or
   {
         return cpu_rq(cpu)->cpu_capacity_orig;
   }
- -#endif
   
   /**
- - * enum schedutil_type - CPU utilization type
+ + * enum cpu_util_type - CPU utilization type
    * @FREQUENCY_UTIL:   Utilization used to select frequency
    * @ENERGY_UTIL:      Utilization used during energy calculation
    *
    * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time
    * need to be aggregated differently depending on the usage made of them. This
- - * enum is used within schedutil_freq_util() to differentiate the types of
+ + * enum is used within effective_cpu_util() to differentiate the types of
    * utilization expected by the callers, and adjust the aggregation accordingly.
    */
- -enum schedutil_type {
+ +enum cpu_util_type {
         FREQUENCY_UTIL,
         ENERGY_UTIL,
   };
   
- -#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
- -
- -unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
- -                               unsigned long max, enum schedutil_type type,
+ +unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
+ +                               unsigned long max, enum cpu_util_type type,
                                  struct task_struct *p);
   
   static inline unsigned long cpu_bw_dl(struct rq *rq)
@@@ -2603,7 -2607,14 +2604,7 @@@ static inline unsigned long cpu_util_rt
   {
         return READ_ONCE(rq->avg_rt.util_avg);
   }
- -#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
- -static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
- -                               unsigned long max, enum schedutil_type type,
- -                               struct task_struct *p)
- -{
- -      return 0;
- -}
- -#endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
+ +#endif
   
   #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
   static inline unsigned long cpu_util_irq(struct rq *rq)
author	Ingo Molnar <mingo@kernel.org>
	Wed, 17 Feb 2021 13:04:39 +0000 (14:04 +0100)
committer	Ingo Molnar <mingo@kernel.org>
	Wed, 17 Feb 2021 13:04:39 +0000 (14:04 +0100)
Files changed (combined diff against merge parents 1 and 2):
	init/Kconfig
	kernel/sched/core.c
	kernel/sched/sched.h