Merge tag 'sched-core-2021-06-28' of git://git.kernel.org/pub/scm/linux/kernel/git...
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2c8a935..e6d1dd4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -268,33 +268,11 @@ const struct sched_class fair_sched_class;
  */
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static inline struct task_struct *task_of(struct sched_entity *se)
-{
-       SCHED_WARN_ON(!entity_is_task(se));
-       return container_of(se, struct task_struct, se);
-}
 
 /* Walk up scheduling entities hierarchy */
 #define for_each_sched_entity(se) \
                for (; se; se = se->parent)
 
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
-{
-       return p->se.cfs_rq;
-}
-
-/* runqueue on which this entity is (to be) queued */
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
-{
-       return se->cfs_rq;
-}
-
-/* runqueue "owned" by this group */
-static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
-{
-       return grp->my_q;
-}
-
 static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
 {
        if (!path)
@@ -455,33 +433,9 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 
 #else  /* !CONFIG_FAIR_GROUP_SCHED */
 
-static inline struct task_struct *task_of(struct sched_entity *se)
-{
-       return container_of(se, struct task_struct, se);
-}
-
 #define for_each_sched_entity(se) \
                for (; se; se = NULL)
 
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
-{
-       return &task_rq(p)->cfs;
-}
-
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
-{
-       struct task_struct *p = task_of(se);
-       struct rq *rq = task_rq(p);
-
-       return &rq->cfs;
-}
-
-/* runqueue "owned" by this group */
-static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
-{
-       return NULL;
-}
-
 static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
 {
        if (path)
@@ -1039,11 +993,14 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
        if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
                struct task_struct *tsk = task_of(se);
+               unsigned int state;
 
-               if (tsk->state & TASK_INTERRUPTIBLE)
+               /* XXX racy against TTWU */
+               state = READ_ONCE(tsk->__state);
+               if (state & TASK_INTERRUPTIBLE)
                        __schedstat_set(se->statistics.sleep_start,
                                      rq_clock(rq_of(cfs_rq)));
-               if (tsk->state & TASK_UNINTERRUPTIBLE)
+               if (state & TASK_UNINTERRUPTIBLE)
                        __schedstat_set(se->statistics.block_start,
                                      rq_clock(rq_of(cfs_rq)));
        }
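
A tiny standalone sketch of the pattern introduced above: the task state is sampled once (READ_ONCE() in the kernel, a plain load here) and both sleep flavours are tested against that single snapshot. The flag values are illustrative, not copied from kernel headers.

#include <stdio.h>

#define TASK_INTERRUPTIBLE      0x0001  /* illustrative values only */
#define TASK_UNINTERRUPTIBLE    0x0002

int main(void)
{
        /* One snapshot of the (racy) task state, then test both bits. */
        unsigned int state = TASK_UNINTERRUPTIBLE;

        if (state & TASK_INTERRUPTIBLE)
                printf("record sleep_start\n");
        if (state & TASK_UNINTERRUPTIBLE)
                printf("record block_start\n");
        return 0;
}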
@@ -1107,7 +1064,7 @@ struct numa_group {
 static struct numa_group *deref_task_numa_group(struct task_struct *p)
 {
        return rcu_dereference_check(p->numa_group, p == current ||
-               (lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu)));
+               (lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu)));
 }
 
 static struct numa_group *deref_curr_numa_group(struct task_struct *p)
@@ -3139,7 +3096,7 @@ void reweight_task(struct task_struct *p, int prio)
  *
  *                     tg->weight * grq->load.weight
  *   ge->load.weight = -----------------------------               (1)
- *                       \Sum grq->load.weight
+ *                       \Sum grq->load.weight
  *
  * Now, because computing that sum is prohibitively expensive to compute (been
  * there, done that) we approximate it with this average stuff. The average
@@ -3153,7 +3110,7 @@ void reweight_task(struct task_struct *p, int prio)
  *
  *                     tg->weight * grq->avg.load_avg
  *   ge->load.weight = ------------------------------              (3)
- *                             tg->load_avg
+ *                             tg->load_avg
  *
  * Where: tg->load_avg ~= \Sum grq->avg.load_avg
  *
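
As a rough numeric illustration of formulas (1) and (3) above, here is a standalone userspace sketch; the weights and loads are invented and the variable names merely echo the kernel fields they stand in for.

#include <stdio.h>

int main(void)
{
        unsigned long tg_weight       = 1024;             /* tg->weight */
        unsigned long grq_load[2]     = { 300, 100 };     /* grq->load.weight per CPU */
        unsigned long grq_load_avg[2] = { 280, 120 };     /* grq->avg.load_avg per CPU */
        unsigned long sum_load    = grq_load[0] + grq_load[1];
        unsigned long tg_load_avg = grq_load_avg[0] + grq_load_avg[1];

        /* Exact form (1): tg->weight * grq->load.weight / \Sum grq->load.weight */
        printf("exact  ge->load.weight = %lu\n", tg_weight * grq_load[0] / sum_load);

        /* Approximation (3): tg->weight * grq->avg.load_avg / tg->load_avg */
        printf("approx ge->load.weight = %lu\n", tg_weight * grq_load_avg[0] / tg_load_avg);
        return 0;
}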
@@ -3169,7 +3126,7 @@ void reweight_task(struct task_struct *p, int prio)
  *
  *                     tg->weight * grq->load.weight
  *   ge->load.weight = ----------------------------- = tg->weight   (4)
- *                         grp->load.weight
+ *                         grp->load.weight
  *
  * That is, the sum collapses because all other CPUs are idle; the UP scenario.
  *
@@ -3188,7 +3145,7 @@ void reweight_task(struct task_struct *p, int prio)
  *
  *                     tg->weight * grq->load.weight
  *   ge->load.weight = -----------------------------              (6)
- *                             tg_load_avg'
+ *                             tg_load_avg'
  *
  * Where:
  *
@@ -3298,6 +3255,61 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
 
 #ifdef CONFIG_SMP
 #ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list
+ * immediately before a parent cfs_rq, and cfs_rqs are removed from the list
+ * bottom-up, we only have to test whether the cfs_rq before us on the list
+ * is our child.
+ * If cfs_rq is not on the list, test whether a child needs to be added to
+ * connect a branch to the tree (see list_add_leaf_cfs_rq() for details).
+ */
+static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq)
+{
+       struct cfs_rq *prev_cfs_rq;
+       struct list_head *prev;
+
+       if (cfs_rq->on_list) {
+               prev = cfs_rq->leaf_cfs_rq_list.prev;
+       } else {
+               struct rq *rq = rq_of(cfs_rq);
+
+               prev = rq->tmp_alone_branch;
+       }
+
+       prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list);
+
+       return (prev_cfs_rq->tg->parent == cfs_rq->tg);
+}
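
The check above leans entirely on the ordering invariant of the leaf list. Below is a small userspace model of the on_list case, with stand-in structures that carry only the fields the check needs; the names mirror the kernel ones but nothing here is kernel code.

#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *prev, *next; };
struct task_group { struct task_group *parent; };
struct cfs_rq {
        int on_list;
        struct list_head leaf_cfs_rq_list;
        struct task_group *tg;
};

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

/* Mirrors the on_list case of child_cfs_rq_on_list(): the entry placed
 * immediately before us on the leaf list must belong to one of our children. */
static int child_on_list(struct cfs_rq *cfs_rq)
{
        struct cfs_rq *prev_cfs_rq = container_of(cfs_rq->leaf_cfs_rq_list.prev,
                                                  struct cfs_rq, leaf_cfs_rq_list);

        return prev_cfs_rq->tg->parent == cfs_rq->tg;
}

int main(void)
{
        struct task_group parent_tg = { .parent = NULL };
        struct task_group child_tg  = { .parent = &parent_tg };
        struct cfs_rq parent_rq = { .on_list = 1, .tg = &parent_tg };
        struct cfs_rq child_rq  = { .on_list = 1, .tg = &child_tg };

        /* list_add_leaf_cfs_rq() places the child right before its parent. */
        parent_rq.leaf_cfs_rq_list.prev = &child_rq.leaf_cfs_rq_list;
        child_rq.leaf_cfs_rq_list.next  = &parent_rq.leaf_cfs_rq_list;

        printf("parent still has a child on the list: %d\n",
               child_on_list(&parent_rq));      /* prints 1 */
        return 0;
}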
+
+static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+{
+       if (cfs_rq->load.weight)
+               return false;
+
+       if (cfs_rq->avg.load_sum)
+               return false;
+
+       if (cfs_rq->avg.util_sum)
+               return false;
+
+       if (cfs_rq->avg.runnable_sum)
+               return false;
+
+       if (child_cfs_rq_on_list(cfs_rq))
+               return false;
+
+       /*
+        * _avg must be zero when _sum is zero because _avg = _sum / divider.
+        * Make sure that rounding and/or propagation of PELT values never
+        * break this.
+        */
+       SCHED_WARN_ON(cfs_rq->avg.load_avg ||
+                     cfs_rq->avg.util_avg ||
+                     cfs_rq->avg.runnable_avg);
+
+       return true;
+}
+
 /**
  * update_tg_load_avg - update the tg's load avg
  * @cfs_rq: the cfs_rq whose avg changed
@@ -3548,9 +3560,12 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
        load_sum = (s64)se_weight(se) * runnable_sum;
        load_avg = div_s64(load_sum, divider);
 
+       se->avg.load_sum = runnable_sum;
+
        delta = load_avg - se->avg.load_avg;
+       if (!delta)
+               return;
 
-       se->avg.load_sum = runnable_sum;
        se->avg.load_avg = load_avg;
 
        add_positive(&cfs_rq->avg.load_avg, delta);
@@ -4091,6 +4106,11 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
 
 #else /* CONFIG_SMP */
 
+static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+{
+       return true;
+}
+
 #define UPDATE_TG      0x0
 #define SKIP_AGE_LOAD  0x0
 #define DO_ATTACH      0x0
@@ -4425,6 +4445,8 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 static void
 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
+       clear_buddies(cfs_rq, se);
+
        /* 'current' is not kept within the tree. */
        if (se->on_rq) {
                /*
@@ -4484,7 +4506,7 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
         * Avoid running the skip buddy, if running something else can
         * be done without getting too unfair.
         */
-       if (cfs_rq->skip == se) {
+       if (cfs_rq->skip && cfs_rq->skip == se) {
                struct sched_entity *second;
 
                if (se == curr) {
@@ -4511,8 +4533,6 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
                se = cfs_rq->last;
        }
 
-       clear_buddies(cfs_rq, se);
-
        return se;
 }
 
@@ -4634,8 +4654,11 @@ static inline u64 sched_cfs_bandwidth_slice(void)
  */
 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
 {
-       if (cfs_b->quota != RUNTIME_INF)
-               cfs_b->runtime = cfs_b->quota;
+       if (unlikely(cfs_b->quota == RUNTIME_INF))
+               return;
+
+       cfs_b->runtime += cfs_b->quota;
+       cfs_b->runtime = min(cfs_b->runtime, cfs_b->quota + cfs_b->burst);
 }
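
A minimal sketch of the refill rule above: runtime grows by quota every period but is clamped to quota + burst, so at most one burst worth of unused runtime can be carried over. The nanosecond values are made up.

#include <stdio.h>

typedef unsigned long long u64;

static u64 refill(u64 runtime, u64 quota, u64 burst)
{
        runtime += quota;
        return runtime < quota + burst ? runtime : quota + burst;
}

int main(void)
{
        u64 quota   = 100000000ULL;     /* 100ms of quota per period      */
        u64 burst   =  50000000ULL;     /* 50ms of burst allowance        */
        u64 runtime =  30000000ULL;     /* 30ms left from the last period */

        runtime = refill(runtime, quota, burst);
        printf("after refill: %llu ns\n", runtime);     /* 130ms           */

        runtime = refill(runtime, quota, burst);        /* an idle period  */
        printf("after refill: %llu ns\n", runtime);     /* capped at 150ms */
        return 0;
}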
 
 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -4749,8 +4772,8 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
                cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
                                             cfs_rq->throttled_clock_task;
 
-               /* Add cfs_rq with already running entity in the list */
-               if (cfs_rq->nr_running >= 1)
+               /* Add cfs_rq with load or one or more already running entities to the list */
+               if (!cfs_rq_is_decayed(cfs_rq) || cfs_rq->nr_running)
                        list_add_leaf_cfs_rq(cfs_rq);
        }
 
@@ -4996,6 +5019,9 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
        throttled = !list_empty(&cfs_b->throttled_cfs_rq);
        cfs_b->nr_periods += overrun;
 
+       /* Refill extra burst quota even if cfs_b->idle */
+       __refill_cfs_bandwidth_runtime(cfs_b);
+
        /*
         * idle depends on !throttled (for the case of a large deficit), and if
         * we're going inactive then everything else can be deferred
@@ -5003,8 +5029,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
        if (cfs_b->idle && !throttled)
                goto out_deactivate;
 
-       __refill_cfs_bandwidth_runtime(cfs_b);
-
        if (!throttled) {
                /* mark as potentially idle for the upcoming period */
                cfs_b->idle = 1;
@@ -5254,6 +5278,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
                        if (new < max_cfs_quota_period) {
                                cfs_b->period = ns_to_ktime(new);
                                cfs_b->quota *= 2;
+                               cfs_b->burst *= 2;
 
                                pr_warn_ratelimited(
        "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
@@ -5285,6 +5310,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
        cfs_b->runtime = 0;
        cfs_b->quota = RUNTIME_INF;
        cfs_b->period = ns_to_ktime(default_cfs_period());
+       cfs_b->burst = 0;
 
        INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
        hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
@@ -5334,7 +5360,7 @@ static void __maybe_unused update_runtime_enabled(struct rq *rq)
 {
        struct task_group *tg;
 
-       lockdep_assert_held(&rq->lock);
+       lockdep_assert_rq_held(rq);
 
        rcu_read_lock();
        list_for_each_entry_rcu(tg, &task_groups, list) {
@@ -5353,7 +5379,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 {
        struct task_group *tg;
 
-       lockdep_assert_held(&rq->lock);
+       lockdep_assert_rq_held(rq);
 
        rcu_read_lock();
        list_for_each_entry_rcu(tg, &task_groups, list) {
@@ -5941,11 +5967,15 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
 
        /* Traverse only the allowed CPUs */
        for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
+               struct rq *rq = cpu_rq(i);
+
+               if (!sched_core_cookie_match(rq, p))
+                       continue;
+
                if (sched_idle_cpu(i))
                        return i;
 
                if (available_idle_cpu(i)) {
-                       struct rq *rq = cpu_rq(i);
                        struct cpuidle_state *idle = idle_get_state(rq);
                        if (idle && idle->exit_latency < min_exit_latency) {
                                /*
@@ -6031,9 +6061,10 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
        return new_cpu;
 }
 
-static inline int __select_idle_cpu(int cpu)
+static inline int __select_idle_cpu(int cpu, struct task_struct *p)
 {
-       if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
+       if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
+           sched_cpu_cookie_match(cpu_rq(cpu), p))
                return cpu;
 
        return -1;
@@ -6103,7 +6134,7 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
        int cpu;
 
        if (!static_branch_likely(&sched_smt_present))
-               return __select_idle_cpu(core);
+               return __select_idle_cpu(core, p);
 
        for_each_cpu(cpu, cpu_smt_mask(core)) {
                if (!available_idle_cpu(cpu)) {
@@ -6159,7 +6190,7 @@ static inline bool test_idle_cores(int cpu, bool def)
 
 static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
 {
-       return __select_idle_cpu(core);
+       return __select_idle_cpu(core, p);
 }
 
 static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
@@ -6178,9 +6209,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 {
        struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
        int i, cpu, idle_cpu = -1, nr = INT_MAX;
+       struct rq *this_rq = this_rq();
        int this = smp_processor_id();
        struct sched_domain *this_sd;
-       u64 time;
+       u64 time = 0;
 
        this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
        if (!this_sd)
@@ -6190,12 +6222,21 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 
        if (sched_feat(SIS_PROP) && !has_idle_core) {
                u64 avg_cost, avg_idle, span_avg;
+               unsigned long now = jiffies;
 
                /*
-                * Due to large variance we need a large fuzz factor;
-                * hackbench in particularly is sensitive here.
+                * If we're busy, the assumption that the last idle period
+                * predicts the future is flawed; age away the remaining
+                * predicted idle time.
                 */
-               avg_idle = this_rq()->avg_idle / 512;
+               if (unlikely(this_rq->wake_stamp < now)) {
+                       while (this_rq->wake_stamp < now && this_rq->wake_avg_idle) {
+                               this_rq->wake_stamp++;
+                               this_rq->wake_avg_idle >>= 1;
+                       }
+               }
+
+               avg_idle = this_rq->wake_avg_idle;
                avg_cost = this_sd->avg_scan_cost + 1;
 
                span_avg = sd->span_weight * avg_idle;
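
A small standalone model of the aging loop above: every jiffy elapsed since wake_stamp halves the remaining predicted idle time, so a busy CPU quickly stops advertising a large idle window to the scan-depth heuristic. The jiffy and nanosecond values are invented.

#include <stdio.h>

typedef unsigned long long u64;

int main(void)
{
        unsigned long wake_stamp = 1000;        /* jiffies at the last update */
        unsigned long now        = 1003;        /* three jiffies later        */
        u64 wake_avg_idle = 800000ULL;          /* 800us of predicted idle    */

        while (wake_stamp < now && wake_avg_idle) {
                wake_stamp++;
                wake_avg_idle >>= 1;
        }

        printf("aged wake_avg_idle = %llu ns\n", wake_avg_idle);        /* 100000 */
        return 0;
}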
@@ -6216,7 +6257,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
                } else {
                        if (!--nr)
                                return -1;
-                       idle_cpu = __select_idle_cpu(cpu);
+                       idle_cpu = __select_idle_cpu(cpu, p);
                        if ((unsigned int)idle_cpu < nr_cpumask_bits)
                                break;
                }
@@ -6227,6 +6268,13 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 
        if (sched_feat(SIS_PROP) && !has_idle_core) {
                time = cpu_clock(this) - time;
+
+               /*
+                * Account for the scan cost of wakeups against the average
+                * idle time.
+                */
+               this_rq->wake_avg_idle -= min(this_rq->wake_avg_idle, time);
+
                update_avg(&this_sd->avg_scan_cost, time);
        }
 
@@ -6294,6 +6342,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
                task_util = uclamp_task_util(p);
        }
 
+       /*
+        * per-cpu select_idle_mask usage
+        */
+       lockdep_assert_irqs_disabled();
+
        if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
            asym_fits_capacity(task_util, target))
                return target;
@@ -6569,8 +6622,11 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
        struct cpumask *pd_mask = perf_domain_span(pd);
        unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
        unsigned long max_util = 0, sum_util = 0;
+       unsigned long _cpu_cap = cpu_cap;
        int cpu;
 
+       _cpu_cap -= arch_scale_thermal_pressure(cpumask_first(pd_mask));
+
        /*
         * The capacity state of CPUs of the current rd can be driven by CPUs
         * of another rd if they belong to the same pd. So, account for the
@@ -6606,8 +6662,10 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
                 * is already enough to scale the EM reported power
                 * consumption at the (eventually clamped) cpu_capacity.
                 */
-               sum_util += effective_cpu_util(cpu, util_running, cpu_cap,
-                                              ENERGY_UTIL, NULL);
+               cpu_util = effective_cpu_util(cpu, util_running, cpu_cap,
+                                             ENERGY_UTIL, NULL);
+
+               sum_util += min(cpu_util, _cpu_cap);
 
                /*
                 * Performance domain frequency: utilization clamping
@@ -6618,10 +6676,10 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
                 */
                cpu_util = effective_cpu_util(cpu, util_freq, cpu_cap,
                                              FREQUENCY_UTIL, tsk);
-               max_util = max(max_util, cpu_util);
+               max_util = max(max_util, min(cpu_util, _cpu_cap));
        }
 
-       return em_cpu_energy(pd->em_pd, max_util, sum_util);
+       return em_cpu_energy(pd->em_pd, max_util, sum_util, _cpu_cap);
 }
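
A toy model of the clamping introduced above: each CPU's utilization contribution is capped at the thermally reduced capacity _cpu_cap before being summed for the energy estimate or maxed for the frequency estimate. Capacities and utilizations are illustrative only.

#include <stdio.h>

int main(void)
{
        unsigned long cpu_cap  = 1024;  /* arch_scale_cpu_capacity()     */
        unsigned long pressure = 224;   /* arch_scale_thermal_pressure() */
        unsigned long _cpu_cap = cpu_cap - pressure;    /* 800           */
        unsigned long cpu_util[2] = { 900, 300 };       /* effective_cpu_util() results */
        unsigned long sum_util = 0, max_util = 0;
        int cpu;

        for (cpu = 0; cpu < 2; cpu++) {
                unsigned long clamped = cpu_util[cpu] < _cpu_cap ? cpu_util[cpu] : _cpu_cap;

                sum_util += clamped;
                if (clamped > max_util)
                        max_util = clamped;
        }

        printf("sum_util=%lu max_util=%lu (clamped to %lu)\n", sum_util, max_util, _cpu_cap);
        return 0;
}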
 
 /*
@@ -6667,15 +6725,15 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 {
        unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
        struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+       int cpu, best_energy_cpu = prev_cpu, target = -1;
        unsigned long cpu_cap, util, base_energy = 0;
-       int cpu, best_energy_cpu = prev_cpu;
        struct sched_domain *sd;
        struct perf_domain *pd;
 
        rcu_read_lock();
        pd = rcu_dereference(rd->pd);
        if (!pd || READ_ONCE(rd->overutilized))
-               goto fail;
+               goto unlock;
 
        /*
         * Energy-aware wake-up happens on the lowest sched_domain starting
@@ -6685,7 +6743,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
        while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
                sd = sd->parent;
        if (!sd)
-               goto fail;
+               goto unlock;
+
+       target = prev_cpu;
 
        sync_entity_load_avg(&p->se);
        if (!task_util_est(p))
@@ -6693,13 +6753,10 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 
        for (; pd; pd = pd->next) {
                unsigned long cur_delta, spare_cap, max_spare_cap = 0;
+               bool compute_prev_delta = false;
                unsigned long base_energy_pd;
                int max_spare_cap_cpu = -1;
 
-               /* Compute the 'base' energy of the pd, without @p */
-               base_energy_pd = compute_energy(p, -1, pd);
-               base_energy += base_energy_pd;
-
                for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
                        if (!cpumask_test_cpu(cpu, p->cpus_ptr))
                                continue;
@@ -6720,26 +6777,40 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
                        if (!fits_capacity(util, cpu_cap))
                                continue;
 
-                       /* Always use prev_cpu as a candidate. */
                        if (cpu == prev_cpu) {
-                               prev_delta = compute_energy(p, prev_cpu, pd);
-                               prev_delta -= base_energy_pd;
-                               best_delta = min(best_delta, prev_delta);
-                       }
-
-                       /*
-                        * Find the CPU with the maximum spare capacity in
-                        * the performance domain
-                        */
-                       if (spare_cap > max_spare_cap) {
+                               /* Always use prev_cpu as a candidate. */
+                               compute_prev_delta = true;
+                       } else if (spare_cap > max_spare_cap) {
+                               /*
+                                * Find the CPU with the maximum spare capacity
+                                * in the performance domain.
+                                */
                                max_spare_cap = spare_cap;
                                max_spare_cap_cpu = cpu;
                        }
                }
 
-               /* Evaluate the energy impact of using this CPU. */
-               if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) {
+               if (max_spare_cap_cpu < 0 && !compute_prev_delta)
+                       continue;
+
+               /* Compute the 'base' energy of the pd, without @p */
+               base_energy_pd = compute_energy(p, -1, pd);
+               base_energy += base_energy_pd;
+
+               /* Evaluate the energy impact of using prev_cpu. */
+               if (compute_prev_delta) {
+                       prev_delta = compute_energy(p, prev_cpu, pd);
+                       if (prev_delta < base_energy_pd)
+                               goto unlock;
+                       prev_delta -= base_energy_pd;
+                       best_delta = min(best_delta, prev_delta);
+               }
+
+               /* Evaluate the energy impact of using max_spare_cap_cpu. */
+               if (max_spare_cap_cpu >= 0) {
                        cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
+                       if (cur_delta < base_energy_pd)
+                               goto unlock;
                        cur_delta -= base_energy_pd;
                        if (cur_delta < best_delta) {
                                best_delta = cur_delta;
@@ -6747,25 +6818,22 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
                        }
                }
        }
-unlock:
        rcu_read_unlock();
 
        /*
         * Pick the best CPU if prev_cpu cannot be used, or if it saves at
         * least 6% of the energy used by prev_cpu.
         */
-       if (prev_delta == ULONG_MAX)
-               return best_energy_cpu;
-
-       if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
-               return best_energy_cpu;
+       if ((prev_delta == ULONG_MAX) ||
+           (prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
+               target = best_energy_cpu;
 
-       return prev_cpu;
+       return target;
 
-fail:
+unlock:
        rcu_read_unlock();
 
-       return -1;
+       return target;
 }
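
The "6%" in the comment above corresponds to the (prev_delta + base_energy) >> 4 test, i.e. 1/16 or roughly 6.25%. A small sketch with invented energy units shows when the best candidate actually displaces prev_cpu.

#include <stdio.h>

int main(void)
{
        unsigned long base_energy = 1000;
        unsigned long prev_delta  = 200;        /* extra energy if the task stays on prev_cpu */
        unsigned long best_delta  = 120;        /* extra energy on the best candidate CPU     */
        unsigned long threshold   = (prev_delta + base_energy) >> 4;   /* 75 */

        if (prev_delta - best_delta > threshold)
                printf("move: saving %lu > threshold %lu\n",
                       prev_delta - best_delta, threshold);
        else
                printf("stay on prev_cpu: saving %lu <= threshold %lu\n",
                       prev_delta - best_delta, threshold);
        return 0;
}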
 
 /*
@@ -6777,8 +6845,6 @@ fail:
  * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
  *
  * Returns the target CPU number.
- *
- * preempt must be disabled.
  */
 static int
 select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
@@ -6791,6 +6857,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
        /* SD_flags and WF_flags share the first nibble */
        int sd_flag = wake_flags & 0xF;
 
+       /*
+        * required for stable ->cpus_allowed
+        */
+       lockdep_assert_held(&p->pi_lock);
        if (wake_flags & WF_TTWU) {
                record_wakee(p);
 
@@ -6855,7 +6925,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
         * min_vruntime -- the latter is done by enqueue_entity() when placing
         * the task on the new runqueue.
         */
-       if (p->state == TASK_WAKING) {
+       if (READ_ONCE(p->__state) == TASK_WAKING) {
                struct sched_entity *se = &p->se;
                struct cfs_rq *cfs_rq = cfs_rq_of(se);
                u64 min_vruntime;
@@ -6880,7 +6950,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
                 * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
                 * rq->lock and can modify state directly.
                 */
-               lockdep_assert_held(&task_rq(p)->lock);
+               lockdep_assert_rq_held(task_rq(p));
                detach_entity_cfs_rq(&p->se);
 
        } else {
@@ -7084,6 +7154,39 @@ preempt:
                set_last_buddy(se);
 }
 
+#ifdef CONFIG_SMP
+static struct task_struct *pick_task_fair(struct rq *rq)
+{
+       struct sched_entity *se;
+       struct cfs_rq *cfs_rq;
+
+again:
+       cfs_rq = &rq->cfs;
+       if (!cfs_rq->nr_running)
+               return NULL;
+
+       do {
+               struct sched_entity *curr = cfs_rq->curr;
+
+               /* When we pick for a remote RQ, we'll not have done put_prev_entity() */
+               if (curr) {
+                       if (curr->on_rq)
+                               update_curr(cfs_rq);
+                       else
+                               curr = NULL;
+
+                       if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+                               goto again;
+               }
+
+               se = pick_next_entity(cfs_rq, curr);
+               cfs_rq = group_cfs_rq(se);
+       } while (cfs_rq);
+
+       return task_of(se);
+}
+#endif
+
 struct task_struct *
 pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 {
@@ -7507,7 +7610,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
 {
        s64 delta;
 
-       lockdep_assert_held(&env->src_rq->lock);
+       lockdep_assert_rq_held(env->src_rq);
 
        if (p->sched_class != &fair_sched_class)
                return 0;
@@ -7529,6 +7632,14 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
 
        if (sysctl_sched_migration_cost == -1)
                return 1;
+
+       /*
+        * Don't migrate task if the task's cookie does not match
+        * with the destination CPU's core cookie.
+        */
+       if (!sched_core_cookie_match(cpu_rq(env->dst_cpu), p))
+               return 1;
+
        if (sysctl_sched_migration_cost == 0)
                return 0;
 
@@ -7605,7 +7716,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 {
        int tsk_cache_hot;
 
-       lockdep_assert_held(&env->src_rq->lock);
+       lockdep_assert_rq_held(env->src_rq);
 
        /*
         * We do not migrate tasks that are:
@@ -7694,7 +7805,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
  */
 static void detach_task(struct task_struct *p, struct lb_env *env)
 {
-       lockdep_assert_held(&env->src_rq->lock);
+       lockdep_assert_rq_held(env->src_rq);
 
        deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
        set_task_cpu(p, env->dst_cpu);
@@ -7710,7 +7821,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
 {
        struct task_struct *p;
 
-       lockdep_assert_held(&env->src_rq->lock);
+       lockdep_assert_rq_held(env->src_rq);
 
        list_for_each_entry_reverse(p,
                        &env->src_rq->cfs_tasks, se.group_node) {
@@ -7746,7 +7857,7 @@ static int detach_tasks(struct lb_env *env)
        struct task_struct *p;
        int detached = 0;
 
-       lockdep_assert_held(&env->src_rq->lock);
+       lockdep_assert_rq_held(env->src_rq);
 
        /*
         * Source run queue has been emptied by another CPU, clear
@@ -7876,7 +7987,7 @@ next:
  */
 static void attach_task(struct rq *rq, struct task_struct *p)
 {
-       lockdep_assert_held(&rq->lock);
+       lockdep_assert_rq_held(rq);
 
        BUG_ON(task_rq(p) != rq);
        activate_task(rq, p, ENQUEUE_NOCLOCK);
@@ -7996,23 +8107,6 @@ static bool __update_blocked_others(struct rq *rq, bool *done)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
-static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
-{
-       if (cfs_rq->load.weight)
-               return false;
-
-       if (cfs_rq->avg.load_sum)
-               return false;
-
-       if (cfs_rq->avg.util_sum)
-               return false;
-
-       if (cfs_rq->avg.runnable_sum)
-               return false;
-
-       return true;
-}
-
 static bool __update_blocked_fair(struct rq *rq, bool *done)
 {
        struct cfs_rq *cfs_rq, *pos;
@@ -8859,6 +8953,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
                                        p->cpus_ptr))
                        continue;
 
+               /* Skip over this group if no cookie matched */
+               if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group))
+                       continue;
+
                local_group = cpumask_test_cpu(this_cpu,
                                               sched_group_span(group));
 
@@ -9787,7 +9885,7 @@ more_balance:
                if (need_active_balance(&env)) {
                        unsigned long flags;
 
-                       raw_spin_lock_irqsave(&busiest->lock, flags);
+                       raw_spin_rq_lock_irqsave(busiest, flags);
 
                        /*
                         * Don't kick the active_load_balance_cpu_stop,
@@ -9795,8 +9893,7 @@ more_balance:
                         * moved to this_cpu:
                         */
                        if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
-                               raw_spin_unlock_irqrestore(&busiest->lock,
-                                                           flags);
+                               raw_spin_rq_unlock_irqrestore(busiest, flags);
                                goto out_one_pinned;
                        }
 
@@ -9813,7 +9910,7 @@ more_balance:
                                busiest->push_cpu = this_cpu;
                                active_balance = 1;
                        }
-                       raw_spin_unlock_irqrestore(&busiest->lock, flags);
+                       raw_spin_rq_unlock_irqrestore(busiest, flags);
 
                        if (active_balance) {
                                stop_one_cpu_nowait(cpu_of(busiest),
@@ -10598,6 +10695,14 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
        u64 curr_cost = 0;
 
        update_misfit_status(NULL, this_rq);
+
+       /*
+        * There is a task waiting to run. No need to search for one.
+        * Return 0; the task will be enqueued when switching to idle.
+        */
+       if (this_rq->ttwu_pending)
+               return 0;
+
        /*
         * We must set idle_stamp _before_ calling idle_balance(), such that we
         * measure the duration of idle_balance() as idle time.
@@ -10630,7 +10735,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
                goto out;
        }
 
-       raw_spin_unlock(&this_rq->lock);
+       raw_spin_rq_unlock(this_rq);
 
        update_blocked_averages(this_cpu);
        rcu_read_lock();
@@ -10663,12 +10768,13 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
                 * Stop searching for tasks to pull if there are
                 * now runnable tasks on this rq.
                 */
-               if (pulled_task || this_rq->nr_running > 0)
+               if (pulled_task || this_rq->nr_running > 0 ||
+                   this_rq->ttwu_pending)
                        break;
        }
        rcu_read_unlock();
 
-       raw_spin_lock(&this_rq->lock);
+       raw_spin_rq_lock(this_rq);
 
        if (curr_cost > this_rq->max_idle_balance_cost)
                this_rq->max_idle_balance_cost = curr_cost;
@@ -10761,6 +10867,119 @@ static void rq_offline_fair(struct rq *rq)
 
 #endif /* CONFIG_SMP */
 
+#ifdef CONFIG_SCHED_CORE
+static inline bool
+__entity_slice_used(struct sched_entity *se, int min_nr_tasks)
+{
+       u64 slice = sched_slice(cfs_rq_of(se), se);
+       u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+
+       return (rtime * min_nr_tasks > slice);
+}
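
A standalone sketch of the slice test above: with min_nr_tasks == 2 it fires once the entity has consumed more than half of its sched_slice(). The times are invented.

#include <stdio.h>

typedef unsigned long long u64;

static int entity_slice_used(u64 rtime, u64 slice, int min_nr_tasks)
{
        return rtime * min_nr_tasks > slice;
}

int main(void)
{
        u64 slice = 6000000ULL;         /* a 6ms slice, as sched_slice() might return */

        printf("%d\n", entity_slice_used(2500000ULL, slice, 2));       /* 0: 2.5ms <= 3ms */
        printf("%d\n", entity_slice_used(3500000ULL, slice, 2));       /* 1: 3.5ms >  3ms */
        return 0;
}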
+
+#define MIN_NR_TASKS_DURING_FORCEIDLE  2
+static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
+{
+       if (!sched_core_enabled(rq))
+               return;
+
+       /*
+        * If the runqueue has only one task, which has used up its slice,
+        * and the sibling is forced idle, then trigger a reschedule to give
+        * the forced-idle task a chance.
+        *
+        * sched_slice() considers only this active rq and it gets the
+        * whole slice. But during force idle, we have siblings acting
+        * like a single runqueue and hence we need to consider runnable
+        * tasks on this CPU and the forced idle CPU. Ideally, we should
+        * go through the forced idle rq, but that would be a perf hit.
+        * We can assume that the forced idle CPU has at least
+        * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
+        * if we need to give up the CPU.
+        */
+       if (rq->core->core_forceidle && rq->cfs.nr_running == 1 &&
+           __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
+               resched_curr(rq);
+}
+
+/*
+ * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed.
+ */
+static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle)
+{
+       for_each_sched_entity(se) {
+               struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+               if (forceidle) {
+                       if (cfs_rq->forceidle_seq == fi_seq)
+                               break;
+                       cfs_rq->forceidle_seq = fi_seq;
+               }
+
+               cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
+       }
+}
+
+void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi)
+{
+       struct sched_entity *se = &p->se;
+
+       if (p->sched_class != &fair_sched_class)
+               return;
+
+       se_fi_update(se, rq->core->core_forceidle_seq, in_fi);
+}
+
+bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
+{
+       struct rq *rq = task_rq(a);
+       struct sched_entity *sea = &a->se;
+       struct sched_entity *seb = &b->se;
+       struct cfs_rq *cfs_rqa;
+       struct cfs_rq *cfs_rqb;
+       s64 delta;
+
+       SCHED_WARN_ON(task_rq(b)->core != rq->core);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+       /*
+        * Find an se in the hierarchy for tasks a and b, such that the se's
+        * are immediate siblings.
+        */
+       while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
+               int sea_depth = sea->depth;
+               int seb_depth = seb->depth;
+
+               if (sea_depth >= seb_depth)
+                       sea = parent_entity(sea);
+               if (sea_depth <= seb_depth)
+                       seb = parent_entity(seb);
+       }
+
+       se_fi_update(sea, rq->core->core_forceidle_seq, in_fi);
+       se_fi_update(seb, rq->core->core_forceidle_seq, in_fi);
+
+       cfs_rqa = sea->cfs_rq;
+       cfs_rqb = seb->cfs_rq;
+#else
+       cfs_rqa = &task_rq(a)->cfs;
+       cfs_rqb = &task_rq(b)->cfs;
+#endif
+
+       /*
+        * Find delta after normalizing se's vruntime with its cfs_rq's
+        * min_vruntime_fi, which would have been updated in prior calls
+        * to se_fi_update().
+        */
+       delta = (s64)(sea->vruntime - seb->vruntime) +
+               (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
+
+       return delta > 0;
+}
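
A compact userspace model of the comparison above: each vruntime is first normalized against its own cfs_rq's min_vruntime_fi, and only then compared across the two siblings. The prio_less() helper and all values below are made up for illustration.

#include <stdio.h>

typedef long long s64;
typedef unsigned long long u64;

static int prio_less(u64 vr_a, u64 fi_a, u64 vr_b, u64 fi_b)
{
        s64 delta = (s64)(vr_a - vr_b) + (s64)(fi_b - fi_a);

        return delta > 0;       /* a's normalized vruntime is larger: prefer b */
}

int main(void)
{
        /* a sits 1000us above its rq's fi baseline, b only 400us above its own. */
        printf("prefer b over a: %d\n",
               prio_less(5000000ULL, 4000000ULL, 10400000ULL, 10000000ULL));
        return 0;
}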
+#else
+static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
+#endif
+
 /*
  * scheduler tick hitting a task of our scheduling class.
  *
@@ -10784,6 +11003,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 
        update_misfit_status(curr, rq);
        update_overutilized_status(task_rq(curr));
+
+       task_tick_core(rq, curr);
 }
 
 /*
@@ -10869,7 +11090,7 @@ static inline bool vruntime_normalized(struct task_struct *p)
         *   waiting for actually being woken up by sched_ttwu_pending().
         */
        if (!se->sum_exec_runtime ||
-           (p->state == TASK_WAKING && p->sched_remote_wakeup))
+           (READ_ONCE(p->__state) == TASK_WAKING && p->sched_remote_wakeup))
                return true;
 
        return false;
@@ -11155,9 +11376,9 @@ void unregister_fair_sched_group(struct task_group *tg)
 
                rq = cpu_rq(cpu);
 
-               raw_spin_lock_irqsave(&rq->lock, flags);
+               raw_spin_rq_lock_irqsave(rq, flags);
                list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
-               raw_spin_unlock_irqrestore(&rq->lock, flags);
+               raw_spin_rq_unlock_irqrestore(rq, flags);
        }
 }
 
@@ -11279,6 +11500,7 @@ DEFINE_SCHED_CLASS(fair) = {
 
 #ifdef CONFIG_SMP
        .balance                = balance_fair,
+       .pick_task              = pick_task_fair,
        .select_task_rq         = select_task_rq_fair,
        .migrate_task_rq        = migrate_task_rq_fair,