Merge tag 'v5.6-rc3' into sched/core, to pick up fixes and dependent patches

[linux-2.6-microblaze.git] / kernel / sched / fair.c
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index fe4e0d7..f38ff5a 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3516,7 +3516,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
   * attach_entity_load_avg - attach this entity to its cfs_rq load avg
   * @cfs_rq: cfs_rq to attach to
   * @se: sched_entity to attach
- * @flags: migration hints
   *
   * Must call update_cfs_rq_load_avg() before this, since we rely on
   * cfs_rq->avg.last_update_time being current.
@@ -5787,10 +5786,12 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
                 bool idle = true;
  
                 for_each_cpu(cpu, cpu_smt_mask(core)) {
-                       __cpumask_clear_cpu(cpu, cpus);
-                       if (!available_idle_cpu(cpu))
+                       if (!available_idle_cpu(cpu)) {
                                 idle = false;
+                               break;
+                       }
                 }
+               cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
  
                 if (idle)
                         return core;
@@ -5894,6 +5895,40 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
         return cpu;
  }
  
+/*
+ * Scan the asym_capacity domain, starting at @target, for idle CPUs; return
+ * the first idle one on which the task fits. If none is big enough, return the
+ * idle CPU with the largest capacity, or -1 if no such CPU was found.
+ */
+static int
+select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
+{
+       unsigned long best_cap = 0;
+       int cpu, best_cpu = -1;
+       struct cpumask *cpus;
+
+       sync_entity_load_avg(&p->se);
+
+       cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+       cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+
+       for_each_cpu_wrap(cpu, cpus, target) {
+               unsigned long cpu_cap = capacity_of(cpu);
+
+               if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
+                       continue;       /* failed both idle tests */
+               if (task_fits_capacity(p, cpu_cap))
+                       return cpu;     /* first fit wins */
+
+               if (cpu_cap > best_cap) {       /* biggest idle CPU so far */
+                       best_cap = cpu_cap;
+                       best_cpu = cpu;
+               }
+       }
+
+       return best_cpu;        /* still -1 if nothing was recorded */
+}
+
  /*
   * Try and locate an idle core/thread in the LLC cache domain.
   */
@@ -5902,6 +5937,28 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
         struct sched_domain *sd;
         int i, recent_used_cpu;
  
+       /*
+        * For asymmetric CPU capacity systems, our domain of interest is
+        * sd_asym_cpucapacity rather than sd_llc.
+        */
+       if (static_branch_unlikely(&sched_asym_cpucapacity)) {
+               sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
+               /*
+                * On an asymmetric CPU capacity system where an exclusive
+                * cpuset defines a symmetric island (i.e. one unique
+                * capacity_orig value through the cpuset), the key will be set
+                * but the CPUs within that cpuset will not have a domain with
+                * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
+                * capacity path.
+                */
+               if (!sd)
+                       goto symmetric;
+
+               i = select_idle_capacity(p, sd, target);
+               return ((unsigned)i < nr_cpumask_bits) ? i : target;
+       }
+
+symmetric:
         if (available_idle_cpu(target) || sched_idle_cpu(target))
                 return target;
  
@@ -5912,6 +5969,20 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
             (available_idle_cpu(prev) || sched_idle_cpu(prev)))
                 return prev;
  
+       /*
+        * Allow a per-cpu kthread to stack with the wakee if the
+        * kworker thread's CPU and the task's previous CPU are the same.
+        * The assumption is that the wakee queued work for the
+        * per-cpu kthread that is now complete and the wakeup is
+        * essentially a sync wakeup. An obvious example of this
+        * pattern is IO completions.
+        */
+       if (is_per_cpu_kthread(current) &&
+           prev == smp_processor_id() &&
+           this_rq()->nr_running <= 1) {
+               return prev;
+       }
+
         /* Check a recently used CPU as a potential idle candidate: */
         recent_used_cpu = p->recent_used_cpu;
         if (recent_used_cpu != prev &&
@@ -6087,33 +6158,6 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
         return min_t(unsigned long, util, capacity_orig_of(cpu));
  }
  
-/*
- * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
- * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
- *
- * In that case WAKE_AFFINE doesn't make sense and we'll let
- * BALANCE_WAKE sort things out.
- */
-static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
-{
-       long min_cap, max_cap;
-
-       if (!static_branch_unlikely(&sched_asym_cpucapacity))
-               return 0;
-
-       min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
-       max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
-
-       /* Minimum capacity is close to max, no need to abort wake_affine */
-       if (max_cap - min_cap < max_cap >> 3)
-               return 0;
-
-       /* Bring task utilization in sync with prev_cpu */
-       sync_entity_load_avg(&p->se);
-
-       return !task_fits_capacity(p, min_cap);
-}
-
  /*
   * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
   * to @dst_cpu.
@@ -6378,8 +6422,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
                         new_cpu = prev_cpu;
                 }
  
-               want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
-                             cpumask_test_cpu(cpu, p->cpus_ptr);
+               want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
         }
  
         rcu_read_lock();
@@ -8658,10 +8701,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
         /*
          * Try to use spare capacity of local group without overloading it or
          * emptying busiest.
-        * XXX Spreading tasks across NUMA nodes is not always the best policy
-        * and special care should be taken for SD_NUMA domain level before
-        * spreading the tasks. For now, load_balance() fully relies on
-        * NUMA_BALANCING and fbq_classify_group/rq to override the decision.
          */
         if (local->group_type == group_has_spare) {
                 if (busiest->group_type > group_fully_busy) {
@@ -8701,16 +8740,37 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
                         env->migration_type = migrate_task;
                         lsub_positive(&nr_diff, local->sum_nr_running);
                         env->imbalance = nr_diff >> 1;
-                       return;
-               }
+               } else {
  
-               /*
-                * If there is no overload, we just want to even the number of
-                * idle cpus.
-                */
-               env->migration_type = migrate_task;
-               env->imbalance = max_t(long, 0, (local->idle_cpus -
+                       /*
+                        * If there is no overload, we just want to even the number of
+                        * idle cpus.
+                        */
+                       env->migration_type = migrate_task;
+                       env->imbalance = max_t(long, 0, (local->idle_cpus -
                                                  busiest->idle_cpus) >> 1);
+               }
+
+               /* Consider allowing a small imbalance between NUMA groups */
+               if (env->sd->flags & SD_NUMA) {
+                       unsigned int imbalance_min;
+
+                       /*
+                        * Allow an imbalance equal to a simple pair of
+                        * communicating tasks: such a pair should stay
+                        * local, so the imbalance it causes is ignored.
+                        *
+                        * NOTE: Generally this would have been based on
+                        * the domain size and this was evaluated. However,
+                        * the benefit is similar across a range of workloads
+                        * and machines but scaling by the domain size adds
+                        * the risk that lower domains have to be rebalanced.
+                        */
+                       imbalance_min = 2;
+                       if (busiest->sum_nr_running <= imbalance_min)
+                               env->imbalance = 0;
+               }
+
                 return;
         }