Merge branch 'sched/urgent' into sched/core, to resolve conflicts
author    Ingo Molnar <mingo@kernel.org>
          Fri, 18 Jun 2021 09:31:25 +0000 (11:31 +0200)
committer Ingo Molnar <mingo@kernel.org>
          Fri, 18 Jun 2021 09:31:25 +0000 (11:31 +0200)
This commit in sched/urgent moved the cfs_rq_is_decayed() function:

  a7b359fc6a37: ("sched/fair: Correctly insert cfs_rq's to list on unthrottle")

and this fresh commit in sched/core modified it in the old location:

  9e077b52d86a: ("sched/pelt: Check that *_avg are null when *_sum are")

Merge the two variants.
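
The resolved version keeps the function at its new location in
kernel/sched/fair.c (under CONFIG_SMP, ahead of update_tg_load_avg())
and carries over the new *_avg sanity check from sched/core;
reconstructed from the combined diff below, it reads:

  static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
  {
  	if (cfs_rq->load.weight)
  		return false;
  	if (cfs_rq->avg.load_sum)
  		return false;
  	if (cfs_rq->avg.util_sum)
  		return false;
  	if (cfs_rq->avg.runnable_sum)
  		return false;

  	/*
  	 * _avg must be null when _sum are null because _avg = _sum / divider
  	 * Make sure that rounding and/or propagation of PELT values never
  	 * break this.
  	 */
  	SCHED_WARN_ON(cfs_rq->avg.load_avg ||
  		      cfs_rq->avg.util_avg ||
  		      cfs_rq->avg.runnable_avg);

  	return true;
  }

The !CONFIG_SMP build gets a trivial stub that simply returns true.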

Conflicts:
	kernel/sched/fair.c

Signed-off-by: Ingo Molnar <mingo@kernel.org>
include/linux/sched.h
init/main.c
kernel/sched/debug.c
kernel/sched/fair.c
kernel/sched/pelt.h

diff --combined include/linux/sched.h
@@@ -350,11 -350,19 +350,19 @@@ struct load_weight 
   * Only for tasks we track a moving average of the past instantaneous
   * estimated utilization. This allows to absorb sporadic drops in utilization
   * of an otherwise almost periodic task.
+  *
+  * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
+  * updates. When a task is dequeued, its util_est should not be updated if its
+  * util_avg has not been updated in the meantime.
+  * This information is mapped into the MSB bit of util_est.enqueued at dequeue
+  * time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg
+  * for a task) it is safe to use MSB.
   */
  struct util_est {
        unsigned int                    enqueued;
        unsigned int                    ewma;
  #define UTIL_EST_WEIGHT_SHIFT         2
+ #define UTIL_AVG_UNCHANGED            0x80000000
  } __attribute__((__aligned__(sizeof(u64))));
  
  /*
@@@ -700,17 -708,10 +708,17 @@@ struct task_struct 
        const struct sched_class        *sched_class;
        struct sched_entity             se;
        struct sched_rt_entity          rt;
 +      struct sched_dl_entity          dl;
 +
 +#ifdef CONFIG_SCHED_CORE
 +      struct rb_node                  core_node;
 +      unsigned long                   core_cookie;
 +      unsigned int                    core_occupation;
 +#endif
 +
  #ifdef CONFIG_CGROUP_SCHED
        struct task_group               *sched_task_group;
  #endif
 -      struct sched_dl_entity          dl;
  
  #ifdef CONFIG_UCLAMP_TASK
        /*
@@@ -2179,14 -2180,4 +2187,14 @@@ int sched_trace_rq_nr_running(struct r
  
  const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
  
 +#ifdef CONFIG_SCHED_CORE
 +extern void sched_core_free(struct task_struct *tsk);
 +extern void sched_core_fork(struct task_struct *p);
 +extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
 +                              unsigned long uaddr);
 +#else
 +static inline void sched_core_free(struct task_struct *tsk) { }
 +static inline void sched_core_fork(struct task_struct *p) { }
 +#endif
 +
  #endif
diff --combined init/main.c
@@@ -692,7 -692,6 +692,7 @@@ noinline void __ref rest_init(void
         */
        rcu_read_lock();
        tsk = find_task_by_pid_ns(pid, &init_pid_ns);
 +      tsk->flags |= PF_NO_SETAFFINITY;
        set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id()));
        rcu_read_unlock();
  
@@@ -942,7 -941,11 +942,7 @@@ asmlinkage __visible void __init __no_s
         * time - but meanwhile we still have a functioning scheduler.
         */
        sched_init();
 -      /*
 -       * Disable preemption - early bootup scheduling is extremely
 -       * fragile until we cpu_idle() for the first time.
 -       */
 -      preempt_disable();
 +
        if (WARN(!irqs_disabled(),
                 "Interrupts were enabled *very* early, fixing it\n"))
                local_irq_disable();
@@@ -1441,11 -1444,6 +1441,11 @@@ static int __ref kernel_init(void *unus
  {
        int ret;
  
 +      /*
 +       * Wait until kthreadd is all set-up.
 +       */
 +      wait_for_completion(&kthreadd_done);
 +
        kernel_init_freeable();
        /* need to finish all async __init code before freeing the memory */
        async_synchronize_full();
@@@ -1526,6 -1524,11 +1526,6 @@@ void __init console_on_rootfs(void
  
  static noinline void __init kernel_init_freeable(void)
  {
 -      /*
 -       * Wait until kthreadd is all set-up.
 -       */
 -      wait_for_completion(&kthreadd_done);
 -
        /* Now the scheduler is fully set up and can do blocking allocations */
        gfp_allowed_mask = __GFP_BITS_MASK;
  
         */
        set_mems_allowed(node_states[N_MEMORY]);
  
-       cad_pid = task_pid(current);
+       cad_pid = get_pid(task_pid(current));
  
        smp_prepare_cpus(setup_max_cpus);
  
diff --combined kernel/sched/debug.c
@@@ -576,7 -576,7 +576,7 @@@ void print_cfs_rq(struct seq_file *m, i
        SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "exec_clock",
                        SPLIT_NS(cfs_rq->exec_clock));
  
 -      raw_spin_lock_irqsave(&rq->lock, flags);
 +      raw_spin_rq_lock_irqsave(rq, flags);
        if (rb_first_cached(&cfs_rq->tasks_timeline))
                MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
        last = __pick_last_entity(cfs_rq);
                max_vruntime = last->vruntime;
        min_vruntime = cfs_rq->min_vruntime;
        rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
 -      raw_spin_unlock_irqrestore(&rq->lock, flags);
 +      raw_spin_rq_unlock_irqrestore(rq, flags);
        SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "MIN_vruntime",
                        SPLIT_NS(MIN_vruntime));
        SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "min_vruntime",
@@@ -885,6 -885,7 +885,7 @@@ static const struct seq_operations sche
  #define __PS(S, F) SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F))
  #define __P(F) __PS(#F, F)
  #define   P(F) __PS(#F, p->F)
+ #define   PM(F, M) __PS(#F, p->F & (M))
  #define __PSN(S, F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", S, SPLIT_NS((long long)(F)))
  #define __PN(F) __PSN(#F, F)
  #define   PN(F) __PSN(#F, p->F)
@@@ -1011,7 -1012,7 +1012,7 @@@ void proc_sched_show_task(struct task_s
        P(se.avg.util_avg);
        P(se.avg.last_update_time);
        P(se.avg.util_est.ewma);
-       P(se.avg.util_est.enqueued);
+       PM(se.avg.util_est.enqueued, ~UTIL_AVG_UNCHANGED);
  #endif
  #ifdef CONFIG_UCLAMP_TASK
        __PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value);
diff --combined kernel/sched/fair.c
@@@ -268,11 -268,33 +268,11 @@@ const struct sched_class fair_sched_cla
   */
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
 -static inline struct task_struct *task_of(struct sched_entity *se)
 -{
 -      SCHED_WARN_ON(!entity_is_task(se));
 -      return container_of(se, struct task_struct, se);
 -}
  
  /* Walk up scheduling entities hierarchy */
  #define for_each_sched_entity(se) \
                for (; se; se = se->parent)
  
 -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 -{
 -      return p->se.cfs_rq;
 -}
 -
 -/* runqueue on which this entity is (to be) queued */
 -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 -{
 -      return se->cfs_rq;
 -}
 -
 -/* runqueue "owned" by this group */
 -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 -{
 -      return grp->my_q;
 -}
 -
  static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
  {
        if (!path)
@@@ -433,9 -455,33 +433,9 @@@ find_matching_se(struct sched_entity **
  
  #else /* !CONFIG_FAIR_GROUP_SCHED */
  
 -static inline struct task_struct *task_of(struct sched_entity *se)
 -{
 -      return container_of(se, struct task_struct, se);
 -}
 -
  #define for_each_sched_entity(se) \
                for (; se; se = NULL)
  
 -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 -{
 -      return &task_rq(p)->cfs;
 -}
 -
 -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 -{
 -      struct task_struct *p = task_of(se);
 -      struct rq *rq = task_rq(p);
 -
 -      return &rq->cfs;
 -}
 -
 -/* runqueue "owned" by this group */
 -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 -{
 -      return NULL;
 -}
 -
  static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
  {
        if (path)
@@@ -1061,7 -1107,7 +1061,7 @@@ struct numa_group 
  static struct numa_group *deref_task_numa_group(struct task_struct *p)
  {
        return rcu_dereference_check(p->numa_group, p == current ||
 -              (lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu)));
 +              (lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu)));
  }
  
  static struct numa_group *deref_curr_numa_group(struct task_struct *p)
@@@ -3093,7 -3139,7 +3093,7 @@@ void reweight_task(struct task_struct *
   *
   *                     tg->weight * grq->load.weight
   *   ge->load.weight = -----------------------------               (1)
 - *                      \Sum grq->load.weight
 + *                       \Sum grq->load.weight
   *
   * Now, because computing that sum is prohibitively expensive to compute (been
   * there, done that) we approximate it with this average stuff. The average
   *
   *                     tg->weight * grq->avg.load_avg
   *   ge->load.weight = ------------------------------              (3)
 - *                            tg->load_avg
 + *                             tg->load_avg
   *
   * Where: tg->load_avg ~= \Sum grq->avg.load_avg
   *
   *
   *                     tg->weight * grq->load.weight
   *   ge->load.weight = ----------------------------- = tg->weight   (4)
 - *                        grp->load.weight
 + *                         grp->load.weight
   *
   * That is, the sum collapses because all other CPUs are idle; the UP scenario.
   *
   *
   *                     tg->weight * grq->load.weight
   *   ge->load.weight = -----------------------------             (6)
 - *                            tg_load_avg'
 + *                             tg_load_avg'
   *
   * Where:
   *
@@@ -3252,6 -3298,24 +3252,33 @@@ static inline void cfs_rq_util_change(s
  
  #ifdef CONFIG_SMP
  #ifdef CONFIG_FAIR_GROUP_SCHED
+ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+ {
+       if (cfs_rq->load.weight)
+               return false;
+       if (cfs_rq->avg.load_sum)
+               return false;
+       if (cfs_rq->avg.util_sum)
+               return false;
+       if (cfs_rq->avg.runnable_sum)
+               return false;
++      /*
++       * _avg must be null when _sum are null because _avg = _sum / divider
++       * Make sure that rounding and/or propagation of PELT values never
++       * break this.
++       */
++      SCHED_WARN_ON(cfs_rq->avg.load_avg ||
++                    cfs_rq->avg.util_avg ||
++                    cfs_rq->avg.runnable_avg);
++
+       return true;
+ }
  /**
   * update_tg_load_avg - update the tg's load avg
   * @cfs_rq: the cfs_rq whose avg changed
@@@ -3502,12 -3566,9 +3529,12 @@@ update_tg_cfs_load(struct cfs_rq *cfs_r
        load_sum = (s64)se_weight(se) * runnable_sum;
        load_avg = div_s64(load_sum, divider);
  
 +      se->avg.load_sum = runnable_sum;
 +
        delta = load_avg - se->avg.load_avg;
 +      if (!delta)
 +              return;
  
 -      se->avg.load_sum = runnable_sum;
        se->avg.load_avg = load_avg;
  
        add_positive(&cfs_rq->avg.load_avg, delta);
@@@ -3864,7 -3925,7 +3891,7 @@@ static inline unsigned long _task_util_
  {
        struct util_est ue = READ_ONCE(p->se.avg.util_est);
  
-       return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED);
+       return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
  }
  
  static inline unsigned long task_util_est(struct task_struct *p)
@@@ -3964,7 -4025,7 +3991,7 @@@ static inline void util_est_update(stru
         * Reset EWMA on utilization increases, the moving average is used only
         * to smooth utilization decreases.
         */
-       ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
+       ue.enqueued = task_util(p);
        if (sched_feat(UTIL_EST_FASTUP)) {
                if (ue.ewma < ue.enqueued) {
                        ue.ewma = ue.enqueued;
        ue.ewma  += last_ewma_diff;
        ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
  done:
+       ue.enqueued |= UTIL_AVG_UNCHANGED;
        WRITE_ONCE(p->se.avg.util_est, ue);
  
        trace_sched_util_est_se_tp(&p->se);
@@@ -4047,6 -4109,11 +4075,11 @@@ static inline void update_misfit_status
  
  #else /* CONFIG_SMP */
  
+ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+ {
+       return true;
+ }
  #define UPDATE_TG     0x0
  #define SKIP_AGE_LOAD 0x0
  #define DO_ATTACH     0x0
@@@ -4381,8 -4448,6 +4414,8 @@@ check_preempt_tick(struct cfs_rq *cfs_r
  static void
  set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
 +      clear_buddies(cfs_rq, se);
 +
        /* 'current' is not kept within the tree. */
        if (se->on_rq) {
                /*
@@@ -4442,7 -4507,7 +4475,7 @@@ pick_next_entity(struct cfs_rq *cfs_rq
         * Avoid running the skip buddy, if running something else can
         * be done without getting too unfair.
         */
 -      if (cfs_rq->skip == se) {
 +      if (cfs_rq->skip && cfs_rq->skip == se) {
                struct sched_entity *second;
  
                if (se == curr) {
                se = cfs_rq->last;
        }
  
 -      clear_buddies(cfs_rq, se);
 -
        return se;
  }
  
@@@ -4705,8 -4772,8 +4738,8 @@@ static int tg_unthrottle_up(struct task
                cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
                                             cfs_rq->throttled_clock_task;
  
-               /* Add cfs_rq with already running entity in the list */
-               if (cfs_rq->nr_running >= 1)
+               /* Add cfs_rq with load or one or more already running entities to the list */
+               if (!cfs_rq_is_decayed(cfs_rq) || cfs_rq->nr_running)
                        list_add_leaf_cfs_rq(cfs_rq);
        }
  
@@@ -5290,7 -5357,7 +5323,7 @@@ static void __maybe_unused update_runti
  {
        struct task_group *tg;
  
 -      lockdep_assert_held(&rq->lock);
 +      lockdep_assert_rq_held(rq);
  
        rcu_read_lock();
        list_for_each_entry_rcu(tg, &task_groups, list) {
@@@ -5309,7 -5376,7 +5342,7 @@@ static void __maybe_unused unthrottle_o
  {
        struct task_group *tg;
  
 -      lockdep_assert_held(&rq->lock);
 +      lockdep_assert_rq_held(rq);
  
        rcu_read_lock();
        list_for_each_entry_rcu(tg, &task_groups, list) {
@@@ -5897,15 -5964,11 +5930,15 @@@ find_idlest_group_cpu(struct sched_grou
  
        /* Traverse only the allowed CPUs */
        for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
 +              struct rq *rq = cpu_rq(i);
 +
 +              if (!sched_core_cookie_match(rq, p))
 +                      continue;
 +
                if (sched_idle_cpu(i))
                        return i;
  
                if (available_idle_cpu(i)) {
 -                      struct rq *rq = cpu_rq(i);
                        struct cpuidle_state *idle = idle_get_state(rq);
                        if (idle && idle->exit_latency < min_exit_latency) {
                                /*
@@@ -5991,10 -6054,9 +6024,10 @@@ static inline int find_idlest_cpu(struc
        return new_cpu;
  }
  
 -static inline int __select_idle_cpu(int cpu)
 +static inline int __select_idle_cpu(int cpu, struct task_struct *p)
  {
 -      if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
 +      if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
 +          sched_cpu_cookie_match(cpu_rq(cpu), p))
                return cpu;
  
        return -1;
@@@ -6064,7 -6126,7 +6097,7 @@@ static int select_idle_core(struct task
        int cpu;
  
        if (!static_branch_likely(&sched_smt_present))
 -              return __select_idle_cpu(core);
 +              return __select_idle_cpu(core, p);
  
        for_each_cpu(cpu, cpu_smt_mask(core)) {
                if (!available_idle_cpu(cpu)) {
@@@ -6120,7 -6182,7 +6153,7 @@@ static inline bool test_idle_cores(int 
  
  static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
  {
 -      return __select_idle_cpu(core);
 +      return __select_idle_cpu(core, p);
  }
  
  static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
@@@ -6139,10 -6201,9 +6172,10 @@@ static int select_idle_cpu(struct task_
  {
        struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
        int i, cpu, idle_cpu = -1, nr = INT_MAX;
 +      struct rq *this_rq = this_rq();
        int this = smp_processor_id();
        struct sched_domain *this_sd;
 -      u64 time;
 +      u64 time = 0;
  
        this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
        if (!this_sd)
  
        if (sched_feat(SIS_PROP) && !has_idle_core) {
                u64 avg_cost, avg_idle, span_avg;
 +              unsigned long now = jiffies;
  
                /*
 -               * Due to large variance we need a large fuzz factor;
 -               * hackbench in particularly is sensitive here.
 +               * If we're busy, the assumption that the last idle period
 +               * predicts the future is flawed; age away the remaining
 +               * predicted idle time.
                 */
 -              avg_idle = this_rq()->avg_idle / 512;
 +              if (unlikely(this_rq->wake_stamp < now)) {
 +                      while (this_rq->wake_stamp < now && this_rq->wake_avg_idle) {
 +                              this_rq->wake_stamp++;
 +                              this_rq->wake_avg_idle >>= 1;
 +                      }
 +              }
 +
 +              avg_idle = this_rq->wake_avg_idle;
                avg_cost = this_sd->avg_scan_cost + 1;
  
                span_avg = sd->span_weight * avg_idle;
                } else {
                        if (!--nr)
                                return -1;
 -                      idle_cpu = __select_idle_cpu(cpu);
 +                      idle_cpu = __select_idle_cpu(cpu, p);
                        if ((unsigned int)idle_cpu < nr_cpumask_bits)
                                break;
                }
  
        if (sched_feat(SIS_PROP) && !has_idle_core) {
                time = cpu_clock(this) - time;
 +
 +              /*
 +               * Account for the scan cost of wakeups against the average
 +               * idle time.
 +               */
 +              this_rq->wake_avg_idle -= min(this_rq->wake_avg_idle, time);
 +
                update_avg(&this_sd->avg_scan_cost, time);
        }
  
@@@ -6272,11 -6317,6 +6305,11 @@@ static int select_idle_sibling(struct t
                task_util = uclamp_task_util(p);
        }
  
 +      /*
 +       * per-cpu select_idle_mask usage
 +       */
 +      lockdep_assert_irqs_disabled();
 +
        if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
            asym_fits_capacity(task_util, target))
                return target;
@@@ -6552,11 -6592,8 +6585,11 @@@ compute_energy(struct task_struct *p, i
        struct cpumask *pd_mask = perf_domain_span(pd);
        unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
        unsigned long max_util = 0, sum_util = 0;
 +      unsigned long _cpu_cap = cpu_cap;
        int cpu;
  
 +      _cpu_cap -= arch_scale_thermal_pressure(cpumask_first(pd_mask));
 +
        /*
         * The capacity state of CPUs of the current rd can be driven by CPUs
         * of another rd if they belong to the same pd. So, account for the
                 * is already enough to scale the EM reported power
                 * consumption at the (eventually clamped) cpu_capacity.
                 */
 -              sum_util += effective_cpu_util(cpu, util_running, cpu_cap,
 -                                             ENERGY_UTIL, NULL);
 +              cpu_util = effective_cpu_util(cpu, util_running, cpu_cap,
 +                                            ENERGY_UTIL, NULL);
 +
 +              sum_util += min(cpu_util, _cpu_cap);
  
                /*
                 * Performance domain frequency: utilization clamping
                 */
                cpu_util = effective_cpu_util(cpu, util_freq, cpu_cap,
                                              FREQUENCY_UTIL, tsk);
 -              max_util = max(max_util, cpu_util);
 +              max_util = max(max_util, min(cpu_util, _cpu_cap));
        }
  
 -      return em_cpu_energy(pd->em_pd, max_util, sum_util);
 +      return em_cpu_energy(pd->em_pd, max_util, sum_util, _cpu_cap);
  }
  
  /*
@@@ -6655,15 -6690,15 +6688,15 @@@ static int find_energy_efficient_cpu(st
  {
        unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
        struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
 +      int cpu, best_energy_cpu = prev_cpu, target = -1;
        unsigned long cpu_cap, util, base_energy = 0;
 -      int cpu, best_energy_cpu = prev_cpu;
        struct sched_domain *sd;
        struct perf_domain *pd;
  
        rcu_read_lock();
        pd = rcu_dereference(rd->pd);
        if (!pd || READ_ONCE(rd->overutilized))
 -              goto fail;
 +              goto unlock;
  
        /*
         * Energy-aware wake-up happens on the lowest sched_domain starting
        while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
                sd = sd->parent;
        if (!sd)
 -              goto fail;
 +              goto unlock;
 +
 +      target = prev_cpu;
  
        sync_entity_load_avg(&p->se);
        if (!task_util_est(p))
  
        for (; pd; pd = pd->next) {
                unsigned long cur_delta, spare_cap, max_spare_cap = 0;
 +              bool compute_prev_delta = false;
                unsigned long base_energy_pd;
                int max_spare_cap_cpu = -1;
  
 -              /* Compute the 'base' energy of the pd, without @p */
 -              base_energy_pd = compute_energy(p, -1, pd);
 -              base_energy += base_energy_pd;
 -
                for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
                        if (!cpumask_test_cpu(cpu, p->cpus_ptr))
                                continue;
                        if (!fits_capacity(util, cpu_cap))
                                continue;
  
 -                      /* Always use prev_cpu as a candidate. */
                        if (cpu == prev_cpu) {
 -                              prev_delta = compute_energy(p, prev_cpu, pd);
 -                              prev_delta -= base_energy_pd;
 -                              best_delta = min(best_delta, prev_delta);
 -                      }
 -
 -                      /*
 -                       * Find the CPU with the maximum spare capacity in
 -                       * the performance domain
 -                       */
 -                      if (spare_cap > max_spare_cap) {
 +                              /* Always use prev_cpu as a candidate. */
 +                              compute_prev_delta = true;
 +                      } else if (spare_cap > max_spare_cap) {
 +                              /*
 +                               * Find the CPU with the maximum spare capacity
 +                               * in the performance domain.
 +                               */
                                max_spare_cap = spare_cap;
                                max_spare_cap_cpu = cpu;
                        }
                }
  
 -              /* Evaluate the energy impact of using this CPU. */
 -              if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) {
 +              if (max_spare_cap_cpu < 0 && !compute_prev_delta)
 +                      continue;
 +
 +              /* Compute the 'base' energy of the pd, without @p */
 +              base_energy_pd = compute_energy(p, -1, pd);
 +              base_energy += base_energy_pd;
 +
 +              /* Evaluate the energy impact of using prev_cpu. */
 +              if (compute_prev_delta) {
 +                      prev_delta = compute_energy(p, prev_cpu, pd);
 +                      if (prev_delta < base_energy_pd)
 +                              goto unlock;
 +                      prev_delta -= base_energy_pd;
 +                      best_delta = min(best_delta, prev_delta);
 +              }
 +
 +              /* Evaluate the energy impact of using max_spare_cap_cpu. */
 +              if (max_spare_cap_cpu >= 0) {
                        cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
 +                      if (cur_delta < base_energy_pd)
 +                              goto unlock;
                        cur_delta -= base_energy_pd;
                        if (cur_delta < best_delta) {
                                best_delta = cur_delta;
                        }
                }
        }
 -unlock:
        rcu_read_unlock();
  
        /*
         * Pick the best CPU if prev_cpu cannot be used, or if it saves at
         * least 6% of the energy used by prev_cpu.
         */
 -      if (prev_delta == ULONG_MAX)
 -              return best_energy_cpu;
 -
 -      if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
 -              return best_energy_cpu;
 +      if ((prev_delta == ULONG_MAX) ||
 +          (prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
 +              target = best_energy_cpu;
  
 -      return prev_cpu;
 +      return target;
  
 -fail:
 +unlock:
        rcu_read_unlock();
  
 -      return -1;
 +      return target;
  }
  
  /*
   * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
   *
   * Returns the target CPU number.
 - *
 - * preempt must be disabled.
   */
  static int
  select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
        /* SD_flags and WF_flags share the first nibble */
        int sd_flag = wake_flags & 0xF;
  
 +      /*
 +       * required for stable ->cpus_allowed
 +       */
 +      lockdep_assert_held(&p->pi_lock);
        if (wake_flags & WF_TTWU) {
                record_wakee(p);
  
@@@ -6880,7 -6903,7 +6913,7 @@@ static void migrate_task_rq_fair(struc
                 * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
                 * rq->lock and can modify state directly.
                 */
 -              lockdep_assert_held(&task_rq(p)->lock);
 +              lockdep_assert_rq_held(task_rq(p));
                detach_entity_cfs_rq(&p->se);
  
        } else {
@@@ -7084,39 -7107,6 +7117,39 @@@ preempt
                set_last_buddy(se);
  }
  
 +#ifdef CONFIG_SMP
 +static struct task_struct *pick_task_fair(struct rq *rq)
 +{
 +      struct sched_entity *se;
 +      struct cfs_rq *cfs_rq;
 +
 +again:
 +      cfs_rq = &rq->cfs;
 +      if (!cfs_rq->nr_running)
 +              return NULL;
 +
 +      do {
 +              struct sched_entity *curr = cfs_rq->curr;
 +
 +              /* When we pick for a remote RQ, we'll not have done put_prev_entity() */
 +              if (curr) {
 +                      if (curr->on_rq)
 +                              update_curr(cfs_rq);
 +                      else
 +                              curr = NULL;
 +
 +                      if (unlikely(check_cfs_rq_runtime(cfs_rq)))
 +                              goto again;
 +              }
 +
 +              se = pick_next_entity(cfs_rq, curr);
 +              cfs_rq = group_cfs_rq(se);
 +      } while (cfs_rq);
 +
 +      return task_of(se);
 +}
 +#endif
 +
  struct task_struct *
  pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  {
@@@ -7540,7 -7530,7 +7573,7 @@@ static int task_hot(struct task_struct 
  {
        s64 delta;
  
 -      lockdep_assert_held(&env->src_rq->lock);
 +      lockdep_assert_rq_held(env->src_rq);
  
        if (p->sched_class != &fair_sched_class)
                return 0;
  
        if (sysctl_sched_migration_cost == -1)
                return 1;
 +
 +      /*
 +       * Don't migrate task if the task's cookie does not match
 +       * with the destination CPU's core cookie.
 +       */
 +      if (!sched_core_cookie_match(cpu_rq(env->dst_cpu), p))
 +              return 1;
 +
        if (sysctl_sched_migration_cost == 0)
                return 0;
  
@@@ -7646,7 -7628,7 +7679,7 @@@ int can_migrate_task(struct task_struc
  {
        int tsk_cache_hot;
  
 -      lockdep_assert_held(&env->src_rq->lock);
 +      lockdep_assert_rq_held(env->src_rq);
  
        /*
         * We do not migrate tasks that are:
   */
  static void detach_task(struct task_struct *p, struct lb_env *env)
  {
 -      lockdep_assert_held(&env->src_rq->lock);
 +      lockdep_assert_rq_held(env->src_rq);
  
        deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
        set_task_cpu(p, env->dst_cpu);
@@@ -7751,7 -7733,7 +7784,7 @@@ static struct task_struct *detach_one_t
  {
        struct task_struct *p;
  
 -      lockdep_assert_held(&env->src_rq->lock);
 +      lockdep_assert_rq_held(env->src_rq);
  
        list_for_each_entry_reverse(p,
                        &env->src_rq->cfs_tasks, se.group_node) {
@@@ -7787,7 -7769,7 +7820,7 @@@ static int detach_tasks(struct lb_env *
        struct task_struct *p;
        int detached = 0;
  
 -      lockdep_assert_held(&env->src_rq->lock);
 +      lockdep_assert_rq_held(env->src_rq);
  
        /*
         * Source run queue has been emptied by another CPU, clear
@@@ -7917,7 -7899,7 +7950,7 @@@ next
   */
  static void attach_task(struct rq *rq, struct task_struct *p)
  {
 -      lockdep_assert_held(&rq->lock);
 +      lockdep_assert_rq_held(rq);
  
        BUG_ON(task_rq(p) != rq);
        activate_task(rq, p, ENQUEUE_NOCLOCK);
@@@ -8037,32 -8019,6 +8070,6 @@@ static bool __update_blocked_others(str
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
  
- static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
- {
-       if (cfs_rq->load.weight)
-               return false;
-       if (cfs_rq->avg.load_sum)
-               return false;
-       if (cfs_rq->avg.util_sum)
-               return false;
-       if (cfs_rq->avg.runnable_sum)
-               return false;
-       /*
-        * _avg must be null when _sum are null because _avg = _sum / divider
-        * Make sure that rounding and/or propagation of PELT values never
-        * break this.
-        */
-       SCHED_WARN_ON(cfs_rq->avg.load_avg ||
-                     cfs_rq->avg.util_avg ||
-                     cfs_rq->avg.runnable_avg);
-       return true;
- }
  static bool __update_blocked_fair(struct rq *rq, bool *done)
  {
        struct cfs_rq *cfs_rq, *pos;
@@@ -8909,10 -8865,6 +8916,10 @@@ find_idlest_group(struct sched_domain *
                                        p->cpus_ptr))
                        continue;
  
 +              /* Skip over this group if no cookie matched */
 +              if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group))
 +                      continue;
 +
                local_group = cpumask_test_cpu(this_cpu,
                                               sched_group_span(group));
  
@@@ -9841,7 -9793,7 +9848,7 @@@ more_balance
                if (need_active_balance(&env)) {
                        unsigned long flags;
  
 -                      raw_spin_lock_irqsave(&busiest->lock, flags);
 +                      raw_spin_rq_lock_irqsave(busiest, flags);
  
                        /*
                         * Don't kick the active_load_balance_cpu_stop,
                         * moved to this_cpu:
                         */
                        if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
 -                              raw_spin_unlock_irqrestore(&busiest->lock,
 -                                                          flags);
 +                              raw_spin_rq_unlock_irqrestore(busiest, flags);
                                goto out_one_pinned;
                        }
  
                                busiest->push_cpu = this_cpu;
                                active_balance = 1;
                        }
 -                      raw_spin_unlock_irqrestore(&busiest->lock, flags);
 +                      raw_spin_rq_unlock_irqrestore(busiest, flags);
  
                        if (active_balance) {
                                stop_one_cpu_nowait(cpu_of(busiest),
@@@ -10651,14 -10604,6 +10658,14 @@@ static int newidle_balance(struct rq *t
        u64 curr_cost = 0;
  
        update_misfit_status(NULL, this_rq);
 +
 +      /*
 +       * There is a task waiting to run. No need to search for one.
 +       * Return 0; the task will be enqueued when switching to idle.
 +       */
 +      if (this_rq->ttwu_pending)
 +              return 0;
 +
        /*
         * We must set idle_stamp _before_ calling idle_balance(), such that we
         * measure the duration of idle_balance() as idle time.
                goto out;
        }
  
 -      raw_spin_unlock(&this_rq->lock);
 +      raw_spin_rq_unlock(this_rq);
  
        update_blocked_averages(this_cpu);
        rcu_read_lock();
                 * Stop searching for tasks to pull if there are
                 * now runnable tasks on this rq.
                 */
 -              if (pulled_task || this_rq->nr_running > 0)
 +              if (pulled_task || this_rq->nr_running > 0 ||
 +                  this_rq->ttwu_pending)
                        break;
        }
        rcu_read_unlock();
  
 -      raw_spin_lock(&this_rq->lock);
 +      raw_spin_rq_lock(this_rq);
  
        if (curr_cost > this_rq->max_idle_balance_cost)
                this_rq->max_idle_balance_cost = curr_cost;
@@@ -10823,119 -10767,6 +10830,119 @@@ static void rq_offline_fair(struct rq *
  
  #endif /* CONFIG_SMP */
  
 +#ifdef CONFIG_SCHED_CORE
 +static inline bool
 +__entity_slice_used(struct sched_entity *se, int min_nr_tasks)
 +{
 +      u64 slice = sched_slice(cfs_rq_of(se), se);
 +      u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
 +
 +      return (rtime * min_nr_tasks > slice);
 +}
 +
 +#define MIN_NR_TASKS_DURING_FORCEIDLE 2
 +static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
 +{
 +      if (!sched_core_enabled(rq))
 +              return;
 +
 +      /*
 +       * If runqueue has only one task which used up its slice and
 +       * if the sibling is forced idle, then trigger schedule to
 +       * give forced idle task a chance.
 +       *
 +       * sched_slice() considers only this active rq and it gets the
 +       * whole slice. But during force idle, we have siblings acting
 +       * like a single runqueue and hence we need to consider runnable
 +       * tasks on this CPU and the forced idle CPU. Ideally, we should
 +       * go through the forced idle rq, but that would be a perf hit.
 +       * We can assume that the forced idle CPU has at least
 +       * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
 +       * if we need to give up the CPU.
 +       */
 +      if (rq->core->core_forceidle && rq->cfs.nr_running == 1 &&
 +          __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
 +              resched_curr(rq);
 +}
 +
 +/*
 + * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed.
 + */
 +static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle)
 +{
 +      for_each_sched_entity(se) {
 +              struct cfs_rq *cfs_rq = cfs_rq_of(se);
 +
 +              if (forceidle) {
 +                      if (cfs_rq->forceidle_seq == fi_seq)
 +                              break;
 +                      cfs_rq->forceidle_seq = fi_seq;
 +              }
 +
 +              cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
 +      }
 +}
 +
 +void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi)
 +{
 +      struct sched_entity *se = &p->se;
 +
 +      if (p->sched_class != &fair_sched_class)
 +              return;
 +
 +      se_fi_update(se, rq->core->core_forceidle_seq, in_fi);
 +}
 +
 +bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
 +{
 +      struct rq *rq = task_rq(a);
 +      struct sched_entity *sea = &a->se;
 +      struct sched_entity *seb = &b->se;
 +      struct cfs_rq *cfs_rqa;
 +      struct cfs_rq *cfs_rqb;
 +      s64 delta;
 +
 +      SCHED_WARN_ON(task_rq(b)->core != rq->core);
 +
 +#ifdef CONFIG_FAIR_GROUP_SCHED
 +      /*
 +       * Find an se in the hierarchy for tasks a and b, such that the se's
 +       * are immediate siblings.
 +       */
 +      while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
 +              int sea_depth = sea->depth;
 +              int seb_depth = seb->depth;
 +
 +              if (sea_depth >= seb_depth)
 +                      sea = parent_entity(sea);
 +              if (sea_depth <= seb_depth)
 +                      seb = parent_entity(seb);
 +      }
 +
 +      se_fi_update(sea, rq->core->core_forceidle_seq, in_fi);
 +      se_fi_update(seb, rq->core->core_forceidle_seq, in_fi);
 +
 +      cfs_rqa = sea->cfs_rq;
 +      cfs_rqb = seb->cfs_rq;
 +#else
 +      cfs_rqa = &task_rq(a)->cfs;
 +      cfs_rqb = &task_rq(b)->cfs;
 +#endif
 +
 +      /*
 +       * Find delta after normalizing se's vruntime with its cfs_rq's
 +       * min_vruntime_fi, which would have been updated in prior calls
 +       * to se_fi_update().
 +       */
 +      delta = (s64)(sea->vruntime - seb->vruntime) +
 +              (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
 +
 +      return delta > 0;
 +}
 +#else
 +static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
 +#endif
 +
  /*
   * scheduler tick hitting a task of our scheduling class.
   *
@@@ -10959,8 -10790,6 +10966,8 @@@ static void task_tick_fair(struct rq *r
  
        update_misfit_status(curr, rq);
        update_overutilized_status(task_rq(curr));
 +
 +      task_tick_core(rq, curr);
  }
  
  /*
@@@ -11332,9 -11161,9 +11339,9 @@@ void unregister_fair_sched_group(struc
  
                rq = cpu_rq(cpu);
  
 -              raw_spin_lock_irqsave(&rq->lock, flags);
 +              raw_spin_rq_lock_irqsave(rq, flags);
                list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
 -              raw_spin_unlock_irqrestore(&rq->lock, flags);
 +              raw_spin_rq_unlock_irqrestore(rq, flags);
        }
  }
  
@@@ -11456,7 -11285,6 +11463,7 @@@ DEFINE_SCHED_CLASS(fair) = 
  
  #ifdef CONFIG_SMP
        .balance                = balance_fair,
 +      .pick_task              = pick_task_fair,
        .select_task_rq         = select_task_rq_fair,
        .migrate_task_rq        = migrate_task_rq_fair,
  
diff --combined kernel/sched/pelt.h
@@@ -42,15 -42,6 +42,6 @@@ static inline u32 get_pelt_divider(stru
        return LOAD_AVG_MAX - 1024 + avg->period_contrib;
  }
  
- /*
-  * When a task is dequeued, its estimated utilization should not be update if
-  * its util_avg has not been updated at least once.
-  * This flag is used to synchronize util_avg updates with util_est updates.
-  * We map this information into the LSB bit of the utilization saved at
-  * dequeue time (i.e. util_est.dequeued).
-  */
- #define UTIL_AVG_UNCHANGED 0x1
  static inline void cfs_se_util_change(struct sched_avg *avg)
  {
        unsigned int enqueued;
@@@ -58,7 -49,7 +49,7 @@@
        if (!sched_feat(UTIL_EST))
                return;
  
-       /* Avoid store if the flag has been already set */
+       /* Avoid store if the flag has been already reset */
        enqueued = avg->util_est.enqueued;
        if (!(enqueued & UTIL_AVG_UNCHANGED))
                return;
@@@ -141,7 -132,7 +132,7 @@@ static inline void update_idle_rq_clock
  
  static inline u64 rq_clock_pelt(struct rq *rq)
  {
 -      lockdep_assert_held(&rq->lock);
 +      lockdep_assert_rq_held(rq);
        assert_clock_updated(rq);
  
        return rq->clock_pelt - rq->lost_idle_time;