Merge tag 'please-pull-sys_bpf' of git://git.kernel.org/pub/scm/linux/kernel/git...

[linux-2.6-microblaze.git] / kernel / sched / fair.c
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index be9e97b..b78280c 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -23,6 +23,7 @@
  #include <linux/latencytop.h>
  #include <linux/sched.h>
  #include <linux/cpumask.h>
+#include <linux/cpuidle.h>
  #include <linux/slab.h>
  #include <linux/profile.h>
  #include <linux/interrupt.h>
@@ -665,6 +666,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
  }
  
  #ifdef CONFIG_SMP
+static int select_idle_sibling(struct task_struct *p, int cpu);
  static unsigned long task_h_load(struct task_struct *p);
  
  static inline void __update_task_entity_contrib(struct sched_entity *se);
@@ -1257,6 +1259,13 @@ balance:
         if (load_too_imbalanced(src_load, dst_load, env))
                 goto unlock;
  
+       /*
+        * One idle CPU per node is evaluated for a task numa move.
+        * Call select_idle_sibling to maybe find a better one.
+        */
+       if (!cur)
+               env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+
  assign:
         task_numa_assign(env, cur, imp);
  unlock:
@@ -1809,10 +1818,6 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
         if (!p->mm)
                 return;
  
-       /* Do not worry about placement if exiting */
-       if (p->state == TASK_DEAD)
-               return;
-
         /* Allocate buffer to track faults on a per-node basis */
         if (unlikely(!p->numa_faults_memory)) {
                 int size = sizeof(*p->numa_faults_memory) *
@@ -1951,7 +1956,7 @@ void task_numa_work(struct callback_head *work)
                 vma = mm->mmap;
         }
         for (; vma; vma = vma->vm_next) {
-               if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
+               if (!vma_migratable(vma) || !vma_policy_mof(vma))
                         continue;
  
                 /*
@@ -2216,8 +2221,8 @@ static __always_inline u64 decay_load(u64 val, u64 n)
  
         /*
          * As y^PERIOD = 1/2, we can combine
-        *    y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
-        * With a look-up table which covers k^n (n<PERIOD)
+        *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
+        * With a look-up table which covers y^n (n<PERIOD)
          *
          * To achieve constant time decay_load.
          */
@@ -4087,7 +4092,7 @@ static unsigned long capacity_of(int cpu)
  static unsigned long cpu_avg_load_per_task(int cpu)
  {
         struct rq *rq = cpu_rq(cpu);
-       unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
+       unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);
         unsigned long load_avg = rq->cfs.runnable_load_avg;
  
         if (nr_running)
@@ -4276,8 +4281,8 @@ static int wake_wide(struct task_struct *p)
  static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
  {
         s64 this_load, load;
+       s64 this_eff_load, prev_eff_load;
         int idx, this_cpu, prev_cpu;
-       unsigned long tl_per_task;
         struct task_group *tg;
         unsigned long weight;
         int balanced;
@@ -4320,47 +4325,30 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
          * Otherwise check if either cpus are near enough in load to allow this
          * task to be woken on this_cpu.
          */
-       if (this_load > 0) {
-               s64 this_eff_load, prev_eff_load;
+       this_eff_load = 100;
+       this_eff_load *= capacity_of(prev_cpu);
  
-               this_eff_load = 100;
-               this_eff_load *= capacity_of(prev_cpu);
+       prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+       prev_eff_load *= capacity_of(this_cpu);
+
+       if (this_load > 0) {
                 this_eff_load *= this_load +
                         effective_load(tg, this_cpu, weight, weight);
  
-               prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
-               prev_eff_load *= capacity_of(this_cpu);
                 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
+       }
  
-               balanced = this_eff_load <= prev_eff_load;
-       } else
-               balanced = true;
-
-       /*
-        * If the currently running task will sleep within
-        * a reasonable amount of time then attract this newly
-        * woken task:
-        */
-       if (sync && balanced)
-               return 1;
+       balanced = this_eff_load <= prev_eff_load;
  
         schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
-       tl_per_task = cpu_avg_load_per_task(this_cpu);
  
-       if (balanced ||
-           (this_load <= load &&
-            this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
-               /*
-                * This domain has SD_WAKE_AFFINE and
-                * p is cache cold in this domain, and
-                * there is no bad imbalance.
-                */
-               schedstat_inc(sd, ttwu_move_affine);
-               schedstat_inc(p, se.statistics.nr_wakeups_affine);
+       if (!balanced)
+               return 0;
  
-               return 1;
-       }
-       return 0;
+       schedstat_inc(sd, ttwu_move_affine);
+       schedstat_inc(p, se.statistics.nr_wakeups_affine);
+
+       return 1;
  }
  
  /*
@@ -4428,20 +4416,46 @@ static int
  find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
  {
         unsigned long load, min_load = ULONG_MAX;
-       int idlest = -1;
+       unsigned int min_exit_latency = UINT_MAX;
+       u64 latest_idle_timestamp = 0;
+       int least_loaded_cpu = this_cpu;
+       int shallowest_idle_cpu = -1;
         int i;
  
         /* Traverse only the allowed CPUs */
         for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
-               load = weighted_cpuload(i);
-
-               if (load < min_load || (load == min_load && i == this_cpu)) {
-                       min_load = load;
-                       idlest = i;
+               if (idle_cpu(i)) {
+                       struct rq *rq = cpu_rq(i);
+                       struct cpuidle_state *idle = idle_get_state(rq);
+                       if (idle && idle->exit_latency < min_exit_latency) {
+                               /*
+                                * We give priority to a CPU whose idle state
+                                * has the smallest exit latency irrespective
+                                * of any idle timestamp.
+                                */
+                               min_exit_latency = idle->exit_latency;
+                               latest_idle_timestamp = rq->idle_stamp;
+                               shallowest_idle_cpu = i;
+                       } else if ((!idle || idle->exit_latency == min_exit_latency) &&
+                                  rq->idle_stamp > latest_idle_timestamp) {
+                               /*
+                                * If equal or no active idle state, then
+                                * the most recently idled CPU might have
+                                * a warmer cache.
+                                */
+                               latest_idle_timestamp = rq->idle_stamp;
+                               shallowest_idle_cpu = i;
+                       }
+               } else {
+                       load = weighted_cpuload(i);
+                       if (load < min_load || (load == min_load && i == this_cpu)) {
+                               min_load = load;
+                               least_loaded_cpu = i;
+                       }
                 }
         }
  
-       return idlest;
+       return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
  }
  
  /*
@@ -4513,11 +4527,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
         if (p->nr_cpus_allowed == 1)
                 return prev_cpu;
  
-       if (sd_flag & SD_BALANCE_WAKE) {
-               if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
-                       want_affine = 1;
-               new_cpu = prev_cpu;
-       }
+       if (sd_flag & SD_BALANCE_WAKE)
+               want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
  
         rcu_read_lock();
         for_each_domain(cpu, tmp) {
@@ -5304,24 +5315,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
         if (!tsk_cache_hot)
                 tsk_cache_hot = migrate_degrades_locality(p, env);
  
-       if (migrate_improves_locality(p, env)) {
-#ifdef CONFIG_SCHEDSTATS
-               if (tsk_cache_hot) {
-                       schedstat_inc(env->sd, lb_hot_gained[env->idle]);
-                       schedstat_inc(p, se.statistics.nr_forced_migrations);
-               }
-#endif
-               return 1;
-       }
-
-       if (!tsk_cache_hot ||
-               env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
-
+       if (migrate_improves_locality(p, env) || !tsk_cache_hot ||
+           env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
                 if (tsk_cache_hot) {
                         schedstat_inc(env->sd, lb_hot_gained[env->idle]);
                         schedstat_inc(p, se.statistics.nr_forced_migrations);
                 }
-
                 return 1;
         }
  
@@ -5718,19 +5717,17 @@ unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
         return default_scale_capacity(sd, cpu);
  }
  
-static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu)
+static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)
  {
-       unsigned long weight = sd->span_weight;
-       unsigned long smt_gain = sd->smt_gain;
-
-       smt_gain /= weight;
+       if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
+               return sd->smt_gain / sd->span_weight;
  
-       return smt_gain;
+       return SCHED_CAPACITY_SCALE;
  }
  
-unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu)
+unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
  {
-       return default_scale_smt_capacity(sd, cpu);
+       return default_scale_cpu_capacity(sd, cpu);
  }
  
  static unsigned long scale_rt_capacity(int cpu)
@@ -5769,18 +5766,15 @@ static unsigned long scale_rt_capacity(int cpu)
  
  static void update_cpu_capacity(struct sched_domain *sd, int cpu)
  {
-       unsigned long weight = sd->span_weight;
         unsigned long capacity = SCHED_CAPACITY_SCALE;
         struct sched_group *sdg = sd->groups;
  
-       if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) {
-               if (sched_feat(ARCH_CAPACITY))
-                       capacity *= arch_scale_smt_capacity(sd, cpu);
-               else
-                       capacity *= default_scale_smt_capacity(sd, cpu);
+       if (sched_feat(ARCH_CAPACITY))
+               capacity *= arch_scale_cpu_capacity(sd, cpu);
+       else
+               capacity *= default_scale_cpu_capacity(sd, cpu);
  
-               capacity >>= SCHED_CAPACITY_SHIFT;
-       }
+       capacity >>= SCHED_CAPACITY_SHIFT;
  
         sdg->sgc->capacity_orig = capacity;
  
@@ -5998,7 +5992,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                         load = source_load(i, load_idx);
  
                 sgs->group_load += load;
-               sgs->sum_nr_running += rq->nr_running;
+               sgs->sum_nr_running += rq->cfs.h_nr_running;
  
                 if (rq->nr_running > 1)
                         *overload = true;
@@ -6427,7 +6421,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
                 goto force_balance;
  
         /*
-        * If the local group is more busy than the selected busiest group
+        * If the local group is busier than the selected busiest group
          * don't try and pull any tasks.
          */
         if (local->avg_load >= busiest->avg_load)
@@ -6442,13 +6436,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
  
         if (env->idle == CPU_IDLE) {
                 /*
-                * This cpu is idle. If the busiest group load doesn't
-                * have more tasks than the number of available cpu's and
-                * there is no imbalance between this and busiest group
-                * wrt to idle cpu's, it is balanced.
+                * This cpu is idle. If the busiest group is not overloaded
+                * and there is no imbalance between this and busiest group
+                * wrt idle cpus, it is balanced. The imbalance becomes
+                * significant if the diff is greater than 1, otherwise we
+                * might end up just moving the imbalance to another group.
                  */
-               if ((local->idle_cpus < busiest->idle_cpus) &&
-                   busiest->sum_nr_running <= busiest->group_weight)
+               if ((busiest->group_type != group_overloaded) &&
+                               (local->idle_cpus <= (busiest->idle_cpus + 1)))
                         goto out_balanced;
         } else {
                 /*
@@ -6706,12 +6701,6 @@ more_balance:
  
                 local_irq_restore(flags);
  
-               /*
-                * some other cpu did the load balance for us.
-                */
-               if (cur_ld_moved && env.dst_cpu != smp_processor_id())
-                       resched_cpu(env.dst_cpu);
-
                 if (env.flags & LBF_NEED_BREAK) {
                         env.flags &= ~LBF_NEED_BREAK;
                         goto more_balance;
@@ -6760,10 +6749,8 @@ more_balance:
                 if (sd_parent) {
                         int *group_imbalance = &sd_parent->groups->sgc->imbalance;
  
-                       if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
+                       if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
                                 *group_imbalance = 1;
-                       } else if (*group_imbalance)
-                               *group_imbalance = 0;
                 }
  
                 /* All tasks on this runqueue were pinned by CPU affinity */
@@ -6774,7 +6761,7 @@ more_balance:
                                 env.loop_break = sched_nr_migrate_break;
                                 goto redo;
                         }
-                       goto out_balanced;
+                       goto out_all_pinned;
                 }
         }
  
@@ -6848,6 +6835,23 @@ more_balance:
         goto out;
  
  out_balanced:
+       /*
+        * We reach balance although we may have faced some affinity
+        * constraints. Clear the imbalance flag if it was set.
+        */
+       if (sd_parent) {
+               int *group_imbalance = &sd_parent->groups->sgc->imbalance;
+
+               if (*group_imbalance)
+                       *group_imbalance = 0;
+       }
+
+out_all_pinned:
+       /*
+        * We reach balance because all tasks are pinned at this level so
+        * we can't migrate them. Leave the imbalance flag set so the parent
+        * level can try to migrate them.
+        */
         schedstat_inc(sd, lb_balanced[idle]);
  
         sd->nr_balance_failed = 0;