* attach_entity_load_avg - attach this entity to its cfs_rq load avg
* @cfs_rq: cfs_rq to attach to
* @se: sched_entity to attach
- * @flags: migration hints
*
* Must call update_cfs_rq_load_avg() before this, since we rely on
* cfs_rq->avg.last_update_time being current.
bool idle = true;
for_each_cpu(cpu, cpu_smt_mask(core)) {
- __cpumask_clear_cpu(cpu, cpus);
- if (!available_idle_cpu(cpu))
+ if (!available_idle_cpu(cpu)) {
idle = false;
+ break;
+ }
}
+ cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
if (idle)
return core;
return cpu;
}
+/*
+ * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
+ * the task fits. If no CPU is big enough, but there are idle ones, try to
+ * maximize capacity.
+ */
+static int
+select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
+{
+ unsigned long best_cap = 0;
+ int cpu, best_cpu = -1;
+ struct cpumask *cpus;
+
+ sync_entity_load_avg(&p->se);
+
+ cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+
+ for_each_cpu_wrap(cpu, cpus, target) {
+ unsigned long cpu_cap = capacity_of(cpu);
+
+ if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
+ continue;
+ if (task_fits_capacity(p, cpu_cap))
+ return cpu;
+
+ if (cpu_cap > best_cap) {
+ best_cap = cpu_cap;
+ best_cpu = cpu;
+ }
+ }
+
+ return best_cpu;
+}
+
/*
* Try and locate an idle core/thread in the LLC cache domain.
*/
struct sched_domain *sd;
int i, recent_used_cpu;
+ /*
+ * For asymmetric CPU capacity systems, our domain of interest is
+ * sd_asym_cpucapacity rather than sd_llc.
+ */
+ if (static_branch_unlikely(&sched_asym_cpucapacity)) {
+ sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
+ /*
+ * On an asymmetric CPU capacity system where an exclusive
+ * cpuset defines a symmetric island (i.e. one unique
+ * capacity_orig value through the cpuset), the key will be set
+ * but the CPUs within that cpuset will not have a domain with
+ * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
+ * capacity path.
+ */
+ if (!sd)
+ goto symmetric;
+
+ i = select_idle_capacity(p, sd, target);
+ return ((unsigned)i < nr_cpumask_bits) ? i : target;
+ }
+
+symmetric:
if (available_idle_cpu(target) || sched_idle_cpu(target))
return target;
(available_idle_cpu(prev) || sched_idle_cpu(prev)))
return prev;
+ /*
+ * Allow a per-cpu kthread to stack with the wakee if the
+ * kworker thread and the tasks previous CPUs are the same.
+ * The assumption is that the wakee queued work for the
+ * per-cpu kthread that is now complete and the wakeup is
+ * essentially a sync wakeup. An obvious example of this
+ * pattern is IO completions.
+ */
+ if (is_per_cpu_kthread(current) &&
+ prev == smp_processor_id() &&
+ this_rq()->nr_running <= 1) {
+ return prev;
+ }
+
/* Check a recently used CPU as a potential idle candidate: */
recent_used_cpu = p->recent_used_cpu;
if (recent_used_cpu != prev &&
return min_t(unsigned long, util, capacity_orig_of(cpu));
}
-/*
- * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
- * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
- *
- * In that case WAKE_AFFINE doesn't make sense and we'll let
- * BALANCE_WAKE sort things out.
- */
-static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
-{
- long min_cap, max_cap;
-
- if (!static_branch_unlikely(&sched_asym_cpucapacity))
- return 0;
-
- min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
- max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
-
- /* Minimum capacity is close to max, no need to abort wake_affine */
- if (max_cap - min_cap < max_cap >> 3)
- return 0;
-
- /* Bring task utilization in sync with prev_cpu */
- sync_entity_load_avg(&p->se);
-
- return !task_fits_capacity(p, min_cap);
-}
-
/*
* Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
* to @dst_cpu.
new_cpu = prev_cpu;
}
- want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
- cpumask_test_cpu(cpu, p->cpus_ptr);
+ want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
}
rcu_read_lock();
/*
* Try to use spare capacity of local group without overloading it or
* emptying busiest.
- * XXX Spreading tasks across NUMA nodes is not always the best policy
- * and special care should be taken for SD_NUMA domain level before
- * spreading the tasks. For now, load_balance() fully relies on
- * NUMA_BALANCING and fbq_classify_group/rq to override the decision.
*/
if (local->group_type == group_has_spare) {
if (busiest->group_type > group_fully_busy) {
env->migration_type = migrate_task;
lsub_positive(&nr_diff, local->sum_nr_running);
env->imbalance = nr_diff >> 1;
- return;
- }
+ } else {
- /*
- * If there is no overload, we just want to even the number of
- * idle cpus.
- */
- env->migration_type = migrate_task;
- env->imbalance = max_t(long, 0, (local->idle_cpus -
+ /*
+ * If there is no overload, we just want to even the number of
+ * idle cpus.
+ */
+ env->migration_type = migrate_task;
+ env->imbalance = max_t(long, 0, (local->idle_cpus -
busiest->idle_cpus) >> 1);
+ }
+
+ /* Consider allowing a small imbalance between NUMA groups */
+ if (env->sd->flags & SD_NUMA) {
+ unsigned int imbalance_min;
+
+ /*
+ * Compute an allowed imbalance based on a simple
+ * pair of communicating tasks that should remain
+ * local and ignore them.
+ *
+ * NOTE: Generally this would have been based on
+ * the domain size and this was evaluated. However,
+ * the benefit is similar across a range of workloads
+ * and machines but scaling by the domain size adds
+ * the risk that lower domains have to be rebalanced.
+ */
+ imbalance_min = 2;
+ if (busiest->sum_nr_running <= imbalance_min)
+ env->imbalance = 0;
+ }
+
return;
}