Merge branch 'sched/urgent' into sched/core, to resolve conflicts
author    Ingo Molnar <mingo@kernel.org>
          Fri, 18 Jun 2021 09:31:25 +0000 (11:31 +0200)
committer Ingo Molnar <mingo@kernel.org>
          Fri, 18 Jun 2021 09:31:25 +0000 (11:31 +0200)
This commit in sched/urgent moved the cfs_rq_is_decayed() function:

  a7b359fc6a37: ("sched/fair: Correctly insert cfs_rq's to list on unthrottle")

and this fresh commit in sched/core modified it in the old location:

  9e077b52d86a: ("sched/pelt: Check that *_avg are null when *_sum are")

Merge the two variants.
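
The resolved version keeps the function at its new location in
kernel/sched/fair.c (under CONFIG_SMP, ahead of update_tg_load_avg())
and carries over the new *_avg sanity check from sched/core;
reconstructed from the combined diff below, it reads:

  static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
  {
  	if (cfs_rq->load.weight)
  		return false;
  	if (cfs_rq->avg.load_sum)
  		return false;
  	if (cfs_rq->avg.util_sum)
  		return false;
  	if (cfs_rq->avg.runnable_sum)
  		return false;

  	/*
  	 * _avg must be null when _sum are null because _avg = _sum / divider
  	 * Make sure that rounding and/or propagation of PELT values never
  	 * break this.
  	 */
  	SCHED_WARN_ON(cfs_rq->avg.load_avg ||
  		      cfs_rq->avg.util_avg ||
  		      cfs_rq->avg.runnable_avg);

  	return true;
  }

The !CONFIG_SMP build gets a trivial stub that simply returns true.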

Conflicts:
	kernel/sched/fair.c

Signed-off-by: Ingo Molnar <mingo@kernel.org>
include/linux/sched.h
init/main.c
kernel/sched/debug.c
kernel/sched/fair.c
kernel/sched/pelt.h

diff --combined include/linux/sched.h
@@@ -350,11 -350,19 +350,19 @@@ struct load_weight 
   * Only for tasks we track a moving average of the past instantaneous
   * estimated utilization. This allows to absorb sporadic drops in utilization
   * of an otherwise almost periodic task.
+  *
+  * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
+  * updates. When a task is dequeued, its util_est should not be updated if its
+  * util_avg has not been updated in the meantime.
+  * This information is mapped into the MSB bit of util_est.enqueued at dequeue
+  * time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg
+  * for a task) it is safe to use MSB.
   */
  struct util_est {
        unsigned int                    enqueued;
        unsigned int                    ewma;
  #define UTIL_EST_WEIGHT_SHIFT         2
+ #define UTIL_AVG_UNCHANGED            0x80000000
  } __attribute__((__aligned__(sizeof(u64))));
  
  /*
@@@ -700,17 -708,10 +708,17 @@@ struct task_struct 
        const struct sched_class        *sched_class;
        struct sched_entity             se;
        struct sched_rt_entity          rt;
 +      struct sched_dl_entity          dl;
 +
 +#ifdef CONFIG_SCHED_CORE
 +      struct rb_node                  core_node;
 +      unsigned long                   core_cookie;
 +      unsigned int                    core_occupation;
 +#endif
 +
  #ifdef CONFIG_CGROUP_SCHED
        struct task_group               *sched_task_group;
  #endif
 -      struct sched_dl_entity          dl;
  
  #ifdef CONFIG_UCLAMP_TASK
        /*
@@@ -2179,14 -2180,4 +2187,14 @@@ int sched_trace_rq_nr_running(struct r
  
  const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
  
 +#ifdef CONFIG_SCHED_CORE
 +extern void sched_core_free(struct task_struct *tsk);
 +extern void sched_core_fork(struct task_struct *p);
 +extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
 +                              unsigned long uaddr);
 +#else
 +static inline void sched_core_free(struct task_struct *tsk) { }
 +static inline void sched_core_fork(struct task_struct *p) { }
 +#endif
 +
  #endif
diff --combined init/main.c
@@@ -692,7 -692,6 +692,7 @@@ noinline void __ref rest_init(void
         */
        rcu_read_lock();
        tsk = find_task_by_pid_ns(pid, &init_pid_ns);
 +      tsk->flags |= PF_NO_SETAFFINITY;
        set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id()));
        rcu_read_unlock();
  
@@@ -942,7 -941,11 +942,7 @@@ asmlinkage __visible void __init __no_s
         * time - but meanwhile we still have a functioning scheduler.
         */
        sched_init();
 -      /*
 -       * Disable preemption - early bootup scheduling is extremely
 -       * fragile until we cpu_idle() for the first time.
 -       */
 -      preempt_disable();
 +
        if (WARN(!irqs_disabled(),
                 "Interrupts were enabled *very* early, fixing it\n"))
                local_irq_disable();
@@@ -1441,11 -1444,6 +1441,11 @@@ static int __ref kernel_init(void *unus
  {
        int ret;
  
 +      /*
 +       * Wait until kthreadd is all set-up.
 +       */
 +      wait_for_completion(&kthreadd_done);
 +
        kernel_init_freeable();
        /* need to finish all async __init code before freeing the memory */
        async_synchronize_full();
@@@ -1526,6 -1524,11 +1526,6 @@@ void __init console_on_rootfs(void
  
  static noinline void __init kernel_init_freeable(void)
  {
 -      /*
 -       * Wait until kthreadd is all set-up.
 -       */
 -      wait_for_completion(&kthreadd_done);
 -
        /* Now the scheduler is fully set up and can do blocking allocations */
        gfp_allowed_mask = __GFP_BITS_MASK;
  
         */
        set_mems_allowed(node_states[N_MEMORY]);
  
-       cad_pid = task_pid(current);
+       cad_pid = get_pid(task_pid(current));
  
        smp_prepare_cpus(setup_max_cpus);
  
diff --combined kernel/sched/debug.c
@@@ -576,7 -576,7 +576,7 @@@ void print_cfs_rq(struct seq_file *m, i
        SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "exec_clock",
                        SPLIT_NS(cfs_rq->exec_clock));
  
 -      raw_spin_lock_irqsave(&rq->lock, flags);
 +      raw_spin_rq_lock_irqsave(rq, flags);
        if (rb_first_cached(&cfs_rq->tasks_timeline))
                MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
        last = __pick_last_entity(cfs_rq);
                max_vruntime = last->vruntime;
        min_vruntime = cfs_rq->min_vruntime;
        rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
 -      raw_spin_unlock_irqrestore(&rq->lock, flags);
 +      raw_spin_rq_unlock_irqrestore(rq, flags);
        SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "MIN_vruntime",
                        SPLIT_NS(MIN_vruntime));
        SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "min_vruntime",
@@@ -885,6 -885,7 +885,7 @@@ static const struct seq_operations sche
  #define __PS(S, F) SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F))
  #define __P(F) __PS(#F, F)
  #define   P(F) __PS(#F, p->F)
+ #define   PM(F, M) __PS(#F, p->F & (M))
  #define __PSN(S, F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", S, SPLIT_NS((long long)(F)))
  #define __PN(F) __PSN(#F, F)
  #define   PN(F) __PSN(#F, p->F)
@@@ -1011,7 -1012,7 +1012,7 @@@ void proc_sched_show_task(struct task_s
        P(se.avg.util_avg);
        P(se.avg.last_update_time);
        P(se.avg.util_est.ewma);
-       P(se.avg.util_est.enqueued);
+       PM(se.avg.util_est.enqueued, ~UTIL_AVG_UNCHANGED);
  #endif
  #ifdef CONFIG_UCLAMP_TASK
        __PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value);
diff --combined kernel/sched/fair.c
@@@ -268,11 -268,33 +268,11 @@@ const struct sched_class fair_sched_cla
   */
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
 -static inline struct task_struct *task_of(struct sched_entity *se)
 -{
 -      SCHED_WARN_ON(!entity_is_task(se));
 -      return container_of(se, struct task_struct, se);
 -}
  
  /* Walk up scheduling entities hierarchy */
  #define for_each_sched_entity(se) \
                for (; se; se = se->parent)
  
 -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 -{
 -      return p->se.cfs_rq;
 -}
 -
 -/* runqueue on which this entity is (to be) queued */
 -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 -{
 -      return se->cfs_rq;
 -}
 -
 -/* runqueue "owned" by this group */
 -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 -{
 -      return grp->my_q;
 -}
 -
  static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
  {
        if (!path)
@@@ -433,9 -455,33 +433,9 @@@ find_matching_se(struct sched_entity **
  
  #else /* !CONFIG_FAIR_GROUP_SCHED */
  
 -static inline struct task_struct *task_of(struct sched_entity *se)
 -{
 -      return container_of(se, struct task_struct, se);
 -}
 -
  #define for_each_sched_entity(se) \
                for (; se; se = NULL)
  
 -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 -{
 -      return &task_rq(p)->cfs;
 -}
 -
 -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 -{
 -      struct task_struct *p = task_of(se);
 -      struct rq *rq = task_rq(p);
 -
 -      return &rq->cfs;
 -}
 -
 -/* runqueue "owned" by this group */
 -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 -{
 -      return NULL;
 -}
 -
  static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
  {
        if (path)
@@@ -1061,7 -1107,7 +1061,7 @@@ struct numa_group 
  static struct numa_group *deref_task_numa_group(struct task_struct *p)
  {
        return rcu_dereference_check(p->numa_group, p == current ||
 -              (lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu)));
 +              (lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu)));
  }
  
  static struct numa_group *deref_curr_numa_group(struct task_struct *p)
@@@ -3093,7 -3139,7 +3093,7 @@@ void reweight_task(struct task_struct *
   *
   *                     tg->weight * grq->load.weight
   *   ge->load.weight = -----------------------------               (1)
 - *                      \Sum grq->load.weight
 + *                       \Sum grq->load.weight
   *
   * Now, because computing that sum is prohibitively expensive to compute (been
   * there, done that) we approximate it with this average stuff. The average
   *
   *                     tg->weight * grq->avg.load_avg
   *   ge->load.weight = ------------------------------              (3)
 - *                            tg->load_avg
 + *                             tg->load_avg
   *
   * Where: tg->load_avg ~= \Sum grq->avg.load_avg
   *
   *
   *                     tg->weight * grq->load.weight
   *   ge->load.weight = ----------------------------- = tg->weight   (4)
 - *                        grp->load.weight
 + *                         grp->load.weight
   *
   * That is, the sum collapses because all other CPUs are idle; the UP scenario.
   *
   *
   *                     tg->weight * grq->load.weight
   *   ge->load.weight = -----------------------------             (6)
 - *                            tg_load_avg'
 + *                             tg_load_avg'
   *
   * Where:
   *
@@@ -3252,6 -3298,24 +3252,33 @@@ static inline void cfs_rq_util_change(s
  
  #ifdef CONFIG_SMP
  #ifdef CONFIG_FAIR_GROUP_SCHED
+ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+ {
+       if (cfs_rq->load.weight)
+               return false;
+       if (cfs_rq->avg.load_sum)
+               return false;
+       if (cfs_rq->avg.util_sum)
+               return false;
+       if (cfs_rq->avg.runnable_sum)
+               return false;
++      /*
++       * _avg must be null when _sum are null because _avg = _sum / divider
++       * Make sure that rounding and/or propagation of PELT values never
++       * break this.
++       */
++      SCHED_WARN_ON(cfs_rq->avg.load_avg ||
++                    cfs_rq->avg.util_avg ||
++                    cfs_rq->avg.runnable_avg);
++
+       return true;
+ }
  /**
   * update_tg_load_avg - update the tg's load avg
   * @cfs_rq: the cfs_rq whose avg changed
@@@ -3502,12 -3566,9 +3529,12 @@@ update_tg_cfs_load(struct cfs_rq *cfs_r
        load_sum = (s64)se_weight(se) * runnable_sum;
        load_avg = div_s64(load_sum, divider);
  
 +      se->avg.load_sum = runnable_sum;
 +
        delta = load_avg - se->avg.load_avg;
 +      if (!delta)
 +              return;
  
 -      se->avg.load_sum = runnable_sum;
        se->avg.load_avg = load_avg;
  
        add_positive(&cfs_rq->avg.load_avg, delta);
@@@ -3864,7 -3925,7 +3891,7 @@@ static inline unsigned long _task_util_
  {
        struct util_est ue = READ_ONCE(p->se.avg.util_est);
  
-       return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED);
+       return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
  }
  
  static inline unsigned long task_util_est(struct task_struct *p)
@@@ -3964,7 -4025,7 +3991,7 @@@ static inline void util_est_update(stru
         * Reset EWMA on utilization increases, the moving average is used only
         * to smooth utilization decreases.
         */
-       ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
+       ue.enqueued = task_util(p);
        if (sched_feat(UTIL_EST_FASTUP)) {
                if (ue.ewma < ue.enqueued) {
                        ue.ewma = ue.enqueued;
        ue.ewma  += last_ewma_diff;
        ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
  done:
+       ue.enqueued |= UTIL_AVG_UNCHANGED;
        WRITE_ONCE(p->se.avg.util_est, ue);
  
        trace_sched_util_est_se_tp(&p->se);
@@@ -4047,6 -4109,11 +4075,11 @@@ static inline void update_misfit_status
  
  #else /* CONFIG_SMP */
  
+ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+ {
+       return true;
+ }
  #define UPDATE_TG     0x0
  #define SKIP_AGE_LOAD 0x0
  #define DO_ATTACH     0x0
@@@ -4381,8 -4448,6 +4414,8 @@@ check_preempt_tick(struct cfs_rq *cfs_r
  static void
  set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
 +      clear_buddies(cfs_rq, se);
 +
        /* 'current' is not kept within the tree. */
        if (se->on_rq) {
                /*
@@@ -4442,7 -4507,7 +4475,7 @@@ pick_next_entity(struct cfs_rq *cfs_rq
         * Avoid running the skip buddy, if running something else can
         * be done without getting too unfair.
         */
 -      if (cfs_rq->skip == se) {
 +      if (cfs_rq->skip && cfs_rq->skip == se) {
                struct sched_entity *second;
  
                if (se == curr) {
                se = cfs_rq->last;
        }
  
 -      clear_buddies(cfs_rq, se);
 -
        return se;
  }
  
@@@ -4705,8 -4772,8 +4738,8 @@@ static int tg_unthrottle_up(struct task
                cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
                                             cfs_rq->throttled_clock_task;
  
-               /* Add cfs_rq with already running entity in the list */
-               if (cfs_rq->nr_running >= 1)
+               /* Add cfs_rq with load or one or more already running entities to the list */
+               if (!cfs_rq_is_decayed(cfs_rq) || cfs_rq->nr_running)
                        list_add_leaf_cfs_rq(cfs_rq);
        }
  
@@@ -5290,7 -5357,7 +5323,7 @@@ static void __maybe_unused update_runti
  {
        struct task_group *tg;
  
 -      lockdep_assert_held(&rq->lock);
 +      lockdep_assert_rq_held(rq);
  
        rcu_read_lock();
        list_for_each_entry_rcu(tg, &task_groups, list) {
@@@ -5309,7 -5376,7 +5342,7 @@@ static void __maybe_unused unthrottle_o
  {
        struct task_group *tg;
  
 -      lockdep_assert_held(&rq->lock);
 +      lockdep_assert_rq_held(rq);
  
        rcu_read_lock();
        list_for_each_entry_rcu(tg, &task_groups, list) {
@@@ -5897,15 -5964,11 +5930,15 @@@ find_idlest_group_cpu(struct sched_grou
  
        /* Traverse only the allowed CPUs */
        for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
 +              struct rq *rq = cpu_rq(i);
 +
 +              if (!sched_core_cookie_match(rq, p))
 +                      continue;
 +
                if (sched_idle_cpu(i))
                        return i;
  
                if (available_idle_cpu(i)) {
 -                      struct rq *rq = cpu_rq(i);
                        struct cpuidle_state *idle = idle_get_state(rq);
                        if (idle && idle->exit_latency < min_exit_latency) {
                                /*
@@@ -5991,10 -6054,9 +6024,10 @@@ static inline int find_idlest_cpu(struc
        return new_cpu;
  }
  
 -static inline int __select_idle_cpu(int cpu)
 +static inline int __select_idle_cpu(int cpu, struct task_struct *p)
  {
 -      if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
 +      if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
 +          sched_cpu_cookie_match(cpu_rq(cpu), p))
                return cpu;
  
        return -1;
@@@ -6064,7 -6126,7 +6097,7 @@@ static int select_idle_core(struct task
        int cpu;
  
        if (!static_branch_likely(&sched_smt_present))
 -              return __select_idle_cpu(core);
 +              return __select_idle_cpu(core, p);
  
        for_each_cpu(cpu, cpu_smt_mask(core)) {
                if (!available_idle_cpu(cpu)) {
@@@ -6120,7 -6182,7 +6153,7 @@@ static inline bool test_idle_cores(int 
  
  static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
  {
 -      return __select_idle_cpu(core);
 +      return __select_idle_cpu(core, p);
  }
  
  static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
@@@ -6139,10 -6201,9 +6172,10 @@@ static int select_idle_cpu(struct task_
  {
        struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
        int i, cpu, idle_cpu = -1, nr = INT_MAX;
 +      struct rq *this_rq = this_rq();
        int this = smp_processor_id();
        struct sched_domain *this_sd;
 -      u64 time;
 +      u64 time = 0;
  
        this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
        if (!this_sd)
  
        if (sched_feat(SIS_PROP) && !has_idle_core) {
                u64 avg_cost, avg_idle, span_avg;
 +              unsigned long now = jiffies;
  
                /*
 -               * Due to large variance we need a large fuzz factor;
 -               * hackbench in particularly is sensitive here.
 +               * If we're busy, the assumption that the last idle period
 +               * predicts the future is flawed; age away the remaining
 +               * predicted idle time.
                 */
 -              avg_idle = this_rq()->avg_idle / 512;
 +              if (unlikely(this_rq->wake_stamp < now)) {
 +                      while (this_rq->wake_stamp < now && this_rq->wake_avg_idle) {
 +                              this_rq->wake_stamp++;
 +                              this_rq->wake_avg_idle >>= 1;
 +                      }
 +              }
 +
 +              avg_idle = this_rq->wake_avg_idle;
                avg_cost = this_sd->avg_scan_cost + 1;
  
                span_avg = sd->span_weight * avg_idle;
                } else {
                        if (!--nr)
                                return -1;
 -                      idle_cpu = __select_idle_cpu(cpu);
 +                      idle_cpu = __select_idle_cpu(cpu, p);
                        if ((unsigned int)idle_cpu < nr_cpumask_bits)
                                break;
                }
  
        if (sched_feat(SIS_PROP) && !has_idle_core) {
                time = cpu_clock(this) - time;
 +
 +              /*
 +               * Account for the scan cost of wakeups against the average
 +               * idle time.
 +               */
 +              this_rq->wake_avg_idle -= min(this_rq->wake_avg_idle, time);
 +
                update_avg(&this_sd->avg_scan_cost, time);
        }
  
@@@ -6272,11 -6317,6 +6305,11 @@@ static int select_idle_sibling(struct t
                task_util = uclamp_task_util(p);
        }
  
 +      /*
 +       * per-cpu select_idle_mask usage
 +       */
 +      lockdep_assert_irqs_disabled();
 +
        if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
            asym_fits_capacity(task_util, target))
                return target;
@@@ -6552,11 -6592,8 +6585,11 @@@ compute_energy(struct task_struct *p, i
        struct cpumask *pd_mask = perf_domain_span(pd);
        unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
        unsigned long max_util = 0, sum_util = 0;
 +      unsigned long _cpu_cap = cpu_cap;
        int cpu;
  
 +      _cpu_cap -= arch_scale_thermal_pressure(cpumask_first(pd_mask));
 +
        /*
         * The capacity state of CPUs of the current rd can be driven by CPUs
         * of another rd if they belong to the same pd. So, account for the
                 * is already enough to scale the EM reported power
                 * consumption at the (eventually clamped) cpu_capacity.
                 */
 -              sum_util += effective_cpu_util(cpu, util_running, cpu_cap,
 -                                             ENERGY_UTIL, NULL);
 +              cpu_util = effective_cpu_util(cpu, util_running, cpu_cap,
 +                                            ENERGY_UTIL, NULL);
 +
 +              sum_util += min(cpu_util, _cpu_cap);
  
                /*
                 * Performance domain frequency: utilization clamping
                 */
                cpu_util = effective_cpu_util(cpu, util_freq, cpu_cap,
                                              FREQUENCY_UTIL, tsk);
 -              max_util = max(max_util, cpu_util);
 +              max_util = max(max_util, min(cpu_util, _cpu_cap));
        }
  
 -      return em_cpu_energy(pd->em_pd, max_util, sum_util);
 +      return em_cpu_energy(pd->em_pd, max_util, sum_util, _cpu_cap);
  }
  
  /*
@@@ -6655,15 -6690,15 +6688,15 @@@ static int find_energy_efficient_cpu(st
  {
        unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
        struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
 +      int cpu, best_energy_cpu = prev_cpu, target = -1;
        unsigned long cpu_cap, util, base_energy = 0;
 -      int cpu, best_energy_cpu = prev_cpu;
        struct sched_domain *sd;
        struct perf_domain *pd;
  
        rcu_read_lock();
        pd = rcu_dereference(rd->pd);
        if (!pd || READ_ONCE(rd->overutilized))
 -              goto fail;
 +              goto unlock;
  
        /*
         * Energy-aware wake-up happens on the lowest sched_domain starting
        while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
                sd = sd->parent;
        if (!sd)
 -              goto fail;
 +              goto unlock;
 +
 +      target = prev_cpu;
  
        sync_entity_load_avg(&p->se);
        if (!task_util_est(p))
  
        for (; pd; pd = pd->next) {
                unsigned long cur_delta, spare_cap, max_spare_cap = 0;
 +              bool compute_prev_delta = false;
                unsigned long base_energy_pd;
                int max_spare_cap_cpu = -1;
  
 -              /* Compute the 'base' energy of the pd, without @p */
 -              base_energy_pd = compute_energy(p, -1, pd);
 -              base_energy += base_energy_pd;
 -
                for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
                        if (!cpumask_test_cpu(cpu, p->cpus_ptr))
                                continue;
                        if (!fits_capacity(util, cpu_cap))
                                continue;
  
 -                      /* Always use prev_cpu as a candidate. */
                        if (cpu == prev_cpu) {
 -                              prev_delta = compute_energy(p, prev_cpu, pd);
 -                              prev_delta -= base_energy_pd;
 -                              best_delta = min(best_delta, prev_delta);
 -                      }
 -
 -                      /*
 -                       * Find the CPU with the maximum spare capacity in
 -                       * the performance domain
 -                       */
 -                      if (spare_cap > max_spare_cap) {
 +                              /* Always use prev_cpu as a candidate. */
 +                              compute_prev_delta = true;
 +                      } else if (spare_cap > max_spare_cap) {
 +                              /*
 +                               * Find the CPU with the maximum spare capacity
 +                               * in the performance domain.
 +                               */
                                max_spare_cap = spare_cap;
                                max_spare_cap_cpu = cpu;
                        }
                }
  
 -              /* Evaluate the energy impact of using this CPU. */
 -              if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) {
 +              if (max_spare_cap_cpu < 0 && !compute_prev_delta)
 +                      continue;
 +
 +              /* Compute the 'base' energy of the pd, without @p */
 +              base_energy_pd = compute_energy(p, -1, pd);
 +              base_energy += base_energy_pd;
 +
 +              /* Evaluate the energy impact of using prev_cpu. */
 +              if (compute_prev_delta) {
 +                      prev_delta = compute_energy(p, prev_cpu, pd);
 +                      if (prev_delta < base_energy_pd)
 +                              goto unlock;
 +                      prev_delta -= base_energy_pd;
 +                      best_delta = min(best_delta, prev_delta);
 +              }
 +
 +              /* Evaluate the energy impact of using max_spare_cap_cpu. */
 +              if (max_spare_cap_cpu >= 0) {
                        cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
 +                      if (cur_delta < base_energy_pd)
 +                              goto unlock;
                        cur_delta -= base_energy_pd;
                        if (cur_delta < best_delta) {
                                best_delta = cur_delta;
                        }
                }
        }
 -unlock:
        rcu_read_unlock();
  
        /*
         * Pick the best CPU if prev_cpu cannot be used, or if it saves at
         * least 6% of the energy used by prev_cpu.
         */
 -      if (prev_delta == ULONG_MAX)
 -              return best_energy_cpu;
 -
 -      if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
 -              return best_energy_cpu;
 +      if ((prev_delta == ULONG_MAX) ||
 +          (prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
 +              target = best_energy_cpu;
  
 -      return prev_cpu;
 +      return target;
  
 -fail:
 +unlock:
        rcu_read_unlock();
  
 -      return -1;
 +      return target;
  }
  
  /*
   * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
   *
   * Returns the target CPU number.
 - *
 - * preempt must be disabled.
   */
  static int
  select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
        /* SD_flags and WF_flags share the first nibble */
        int sd_flag = wake_flags & 0xF;
  
 +      /*
 +       * required for stable ->cpus_allowed
 +       */
 +      lockdep_assert_held(&p->pi_lock);
        if (wake_flags & WF_TTWU) {
                record_wakee(p);
  
@@@ -6880,7 -6903,7 +6913,7 @@@ static void migrate_task_rq_fair(struc
                 * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
                 * rq->lock and can modify state directly.
                 */
 -              lockdep_assert_held(&task_rq(p)->lock);
 +              lockdep_assert_rq_held(task_rq(p));
                detach_entity_cfs_rq(&p->se);
  
        } else {
@@@ -7084,39 -7107,6 +7117,39 @@@ preempt
                set_last_buddy(se);
  }
  
 +#ifdef CONFIG_SMP
 +static struct task_struct *pick_task_fair(struct rq *rq)
 +{
 +      struct sched_entity *se;
 +      struct cfs_rq *cfs_rq;
 +
 +again:
 +      cfs_rq = &rq->cfs;
 +      if (!cfs_rq->nr_running)
 +              return NULL;
 +
 +      do {
 +              struct sched_entity *curr = cfs_rq->curr;
 +
 +              /* When we pick for a remote RQ, we'll not have done put_prev_entity() */
 +              if (curr) {
 +                      if (curr->on_rq)
 +                              update_curr(cfs_rq);
 +                      else
 +                              curr = NULL;
 +
 +                      if (unlikely(check_cfs_rq_runtime(cfs_rq)))
 +                              goto again;
 +              }
 +
 +              se = pick_next_entity(cfs_rq, curr);
 +              cfs_rq = group_cfs_rq(se);
 +      } while (cfs_rq);
 +
 +      return task_of(se);
 +}
 +#endif
 +
  struct task_struct *
  pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  {
@@@ -7540,7 -7530,7 +7573,7 @@@ static int task_hot(struct task_struct 
  {
        s64 delta;
  
 -      lockdep_assert_held(&env->src_rq->lock);
 +      lockdep_assert_rq_held(env->src_rq);
  
        if (p->sched_class != &fair_sched_class)
                return 0;
  
        if (sysctl_sched_migration_cost == -1)
                return 1;
 +
 +      /*
 +       * Don't migrate task if the task's cookie does not match
 +       * with the destination CPU's core cookie.
 +       */
 +      if (!sched_core_cookie_match(cpu_rq(env->dst_cpu), p))
 +              return 1;
 +
        if (sysctl_sched_migration_cost == 0)
                return 0;
  
@@@ -7646,7 -7628,7 +7679,7 @@@ int can_migrate_task(struct task_struc
  {
        int tsk_cache_hot;
  
 -      lockdep_assert_held(&env->src_rq->lock);
 +      lockdep_assert_rq_held(env->src_rq);
  
        /*
         * We do not migrate tasks that are:
   */
  static void detach_task(struct task_struct *p, struct lb_env *env)
  {
 -      lockdep_assert_held(&env->src_rq->lock);
 +      lockdep_assert_rq_held(env->src_rq);
  
        deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
        set_task_cpu(p, env->dst_cpu);
@@@ -7751,7 -7733,7 +7784,7 @@@ static struct task_struct *detach_one_t
  {
        struct task_struct *p;
  
 -      lockdep_assert_held(&env->src_rq->lock);
 +      lockdep_assert_rq_held(env->src_rq);
  
        list_for_each_entry_reverse(p,
                        &env->src_rq->cfs_tasks, se.group_node) {
@@@ -7787,7 -7769,7 +7820,7 @@@ static int detach_tasks(struct lb_env *
        struct task_struct *p;
        int detached = 0;
  
 -      lockdep_assert_held(&env->src_rq->lock);
 +      lockdep_assert_rq_held(env->src_rq);
  
        /*
         * Source run queue has been emptied by another CPU, clear
@@@ -7917,7 -7899,7 +7950,7 @@@ next
   */
  static void attach_task(struct rq *rq, struct task_struct *p)
  {
 -      lockdep_assert_held(&rq->lock);
 +      lockdep_assert_rq_held(rq);
  
        BUG_ON(task_rq(p) != rq);
        activate_task(rq, p, ENQUEUE_NOCLOCK);
@@@ -8037,32 -8019,6 +8070,6 @@@ static bool __update_blocked_others(str
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
  
- static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
- {
-       if (cfs_rq->load.weight)
-               return false;
-       if (cfs_rq->avg.load_sum)
-               return false;
-       if (cfs_rq->avg.util_sum)
-               return false;
-       if (cfs_rq->avg.runnable_sum)
-               return false;
-       /*
-        * _avg must be null when _sum are null because _avg = _sum / divider
-        * Make sure that rounding and/or propagation of PELT values never
-        * break this.
-        */
-       SCHED_WARN_ON(cfs_rq->avg.load_avg ||
-                     cfs_rq->avg.util_avg ||
-                     cfs_rq->avg.runnable_avg);
-       return true;
- }
  static bool __update_blocked_fair(struct rq *rq, bool *done)
  {
        struct cfs_rq *cfs_rq, *pos;
@@@ -8909,10 -8865,6 +8916,10 @@@ find_idlest_group(struct sched_domain *
                                        p->cpus_ptr))
                        continue;
  
 +              /* Skip over this group if no cookie matched */
 +              if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group))
 +                      continue;
 +
                local_group = cpumask_test_cpu(this_cpu,
                                               sched_group_span(group));
  
@@@ -9841,7 -9793,7 +9848,7 @@@ more_balance
                if (need_active_balance(&env)) {
                        unsigned long flags;
  
 -                      raw_spin_lock_irqsave(&busiest->lock, flags);
 +                      raw_spin_rq_lock_irqsave(busiest, flags);
  
                        /*
                         * Don't kick the active_load_balance_cpu_stop,
                         * moved to this_cpu:
                         */
                        if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
 -                              raw_spin_unlock_irqrestore(&busiest->lock,
 -                                                          flags);
 +                              raw_spin_rq_unlock_irqrestore(busiest, flags);
                                goto out_one_pinned;
                        }
  
                                busiest->push_cpu = this_cpu;
                                active_balance = 1;
                        }
 -                      raw_spin_unlock_irqrestore(&busiest->lock, flags);
 +                      raw_spin_rq_unlock_irqrestore(busiest, flags);
  
                        if (active_balance) {
                                stop_one_cpu_nowait(cpu_of(busiest),
@@@ -10651,14 -10604,6 +10658,14 @@@ static int newidle_balance(struct rq *t
        u64 curr_cost = 0;
  
        update_misfit_status(NULL, this_rq);
 +
 +      /*
 +       * There is a task waiting to run. No need to search for one.
 +       * Return 0; the task will be enqueued when switching to idle.
 +       */
 +      if (this_rq->ttwu_pending)
 +              return 0;
 +
        /*
         * We must set idle_stamp _before_ calling idle_balance(), such that we
         * measure the duration of idle_balance() as idle time.
                goto out;
        }
  
 -      raw_spin_unlock(&this_rq->lock);
 +      raw_spin_rq_unlock(this_rq);
  
        update_blocked_averages(this_cpu);
        rcu_read_lock();
                 * Stop searching for tasks to pull if there are
                 * now runnable tasks on this rq.
                 */
 -              if (pulled_task || this_rq->nr_running > 0)
 +              if (pulled_task || this_rq->nr_running > 0 ||
 +                  this_rq->ttwu_pending)
                        break;
        }
        rcu_read_unlock();
  
 -      raw_spin_lock(&this_rq->lock);
 +      raw_spin_rq_lock(this_rq);
  
        if (curr_cost > this_rq->max_idle_balance_cost)
                this_rq->max_idle_balance_cost = curr_cost;
@@@ -10823,119 -10767,6 +10830,119 @@@ static void rq_offline_fair(struct rq *
  
  #endif /* CONFIG_SMP */
  
 +#ifdef CONFIG_SCHED_CORE
 +static inline bool
 +__entity_slice_used(struct sched_entity *se, int min_nr_tasks)
 +{
 +      u64 slice = sched_slice(cfs_rq_of(se), se);
 +      u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
 +
 +      return (rtime * min_nr_tasks > slice);
 +}
 +
 +#define MIN_NR_TASKS_DURING_FORCEIDLE 2
 +static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
 +{
 +      if (!sched_core_enabled(rq))
 +              return;
 +
 +      /*
 +       * If runqueue has only one task which used up its slice and
 +       * if the sibling is forced idle, then trigger schedule to
 +       * give forced idle task a chance.
 +       *
 +       * sched_slice() considers only this active rq and it gets the
 +       * whole slice. But during force idle, we have siblings acting
 +       * like a single runqueue and hence we need to consider runnable
 +       * tasks on this CPU and the forced idle CPU. Ideally, we should
 +       * go through the forced idle rq, but that would be a perf hit.
 +       * We can assume that the forced idle CPU has at least
 +       * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
 +       * if we need to give up the CPU.
 +       */
 +      if (rq->core->core_forceidle && rq->cfs.nr_running == 1 &&
 +          __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
 +              resched_curr(rq);
 +}
 +
 +/*
 + * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed.
 + */
 +static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle)
 +{
 +      for_each_sched_entity(se) {
 +              struct cfs_rq *cfs_rq = cfs_rq_of(se);
 +
 +              if (forceidle) {
 +                      if (cfs_rq->forceidle_seq == fi_seq)
 +                              break;
 +                      cfs_rq->forceidle_seq = fi_seq;
 +              }
 +
 +              cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
 +      }
 +}
 +
 +void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi)
 +{
 +      struct sched_entity *se = &p->se;
 +
 +      if (p->sched_class != &fair_sched_class)
 +              return;
 +
 +      se_fi_update(se, rq->core->core_forceidle_seq, in_fi);
 +}
 +
 +bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
 +{
 +      struct rq *rq = task_rq(a);
 +      struct sched_entity *sea = &a->se;
 +      struct sched_entity *seb = &b->se;
 +      struct cfs_rq *cfs_rqa;
 +      struct cfs_rq *cfs_rqb;
 +      s64 delta;
 +
 +      SCHED_WARN_ON(task_rq(b)->core != rq->core);
 +
 +#ifdef CONFIG_FAIR_GROUP_SCHED
 +      /*
 +       * Find an se in the hierarchy for tasks a and b, such that the se's
 +       * are immediate siblings.
 +       */
 +      while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
 +              int sea_depth = sea->depth;
 +              int seb_depth = seb->depth;
 +
 +              if (sea_depth >= seb_depth)
 +                      sea = parent_entity(sea);
 +              if (sea_depth <= seb_depth)
 +                      seb = parent_entity(seb);
 +      }
 +
 +      se_fi_update(sea, rq->core->core_forceidle_seq, in_fi);
 +      se_fi_update(seb, rq->core->core_forceidle_seq, in_fi);
 +
 +      cfs_rqa = sea->cfs_rq;
 +      cfs_rqb = seb->cfs_rq;
 +#else
 +      cfs_rqa = &task_rq(a)->cfs;
 +      cfs_rqb = &task_rq(b)->cfs;
 +#endif
 +
 +      /*
 +       * Find delta after normalizing se's vruntime with its cfs_rq's
 +       * min_vruntime_fi, which would have been updated in prior calls
 +       * to se_fi_update().
 +       */
 +      delta = (s64)(sea->vruntime - seb->vruntime) +
 +              (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
 +
 +      return delta > 0;
 +}
 +#else
 +static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
 +#endif
 +
  /*
   * scheduler tick hitting a task of our scheduling class.
   *
@@@ -10959,8 -10790,6 +10966,8 @@@ static void task_tick_fair(struct rq *r
  
        update_misfit_status(curr, rq);
        update_overutilized_status(task_rq(curr));
 +
 +      task_tick_core(rq, curr);
  }
  
  /*
@@@ -11332,9 -11161,9 +11339,9 @@@ void unregister_fair_sched_group(struc
  
                rq = cpu_rq(cpu);
  
 -              raw_spin_lock_irqsave(&rq->lock, flags);
 +              raw_spin_rq_lock_irqsave(rq, flags);
                list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
 -              raw_spin_unlock_irqrestore(&rq->lock, flags);
 +              raw_spin_rq_unlock_irqrestore(rq, flags);
        }
  }
  
@@@ -11456,7 -11285,6 +11463,7 @@@ DEFINE_SCHED_CLASS(fair) = 
  
  #ifdef CONFIG_SMP
        .balance                = balance_fair,
 +      .pick_task              = pick_task_fair,
        .select_task_rq         = select_task_rq_fair,
        .migrate_task_rq        = migrate_task_rq_fair,
  
diff --combined kernel/sched/pelt.h
@@@ -42,15 -42,6 +42,6 @@@ static inline u32 get_pelt_divider(stru
        return LOAD_AVG_MAX - 1024 + avg->period_contrib;
  }
  
- /*
-  * When a task is dequeued, its estimated utilization should not be update if
-  * its util_avg has not been updated at least once.
-  * This flag is used to synchronize util_avg updates with util_est updates.
-  * We map this information into the LSB bit of the utilization saved at
-  * dequeue time (i.e. util_est.dequeued).
-  */
- #define UTIL_AVG_UNCHANGED 0x1
  static inline void cfs_se_util_change(struct sched_avg *avg)
  {
        unsigned int enqueued;
@@@ -58,7 -49,7 +49,7 @@@
        if (!sched_feat(UTIL_EST))
                return;
  
-       /* Avoid store if the flag has been already set */
+       /* Avoid store if the flag has been already reset */
        enqueued = avg->util_est.enqueued;
        if (!(enqueued & UTIL_AVG_UNCHANGED))
                return;
@@@ -141,7 -132,7 +132,7 @@@ static inline void update_idle_rq_clock
  
  static inline u64 rq_clock_pelt(struct rq *rq)
  {
 -      lockdep_assert_held(&rq->lock);
 +      lockdep_assert_rq_held(rq);
        assert_clock_updated(rq);
  
        return rq->clock_pelt - rq->lost_idle_time;