Merge tag 'kthread-cleanups-for-v5.19' of git://git.kernel.org/pub/scm/linux/kernel...
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index db6f0df..77b2048 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -36,6 +36,7 @@
 #include <linux/sched/cond_resched.h>
 #include <linux/sched/cputime.h>
 #include <linux/sched/isolation.h>
+#include <linux/sched/nohz.h>
 
 #include <linux/cpuidle.h>
 #include <linux/interrupt.h>
@@ -173,7 +174,37 @@ int __weak arch_asym_cpu_priority(int cpu)
  *
  * (default: 5 msec, units: microseconds)
  */
-unsigned int sysctl_sched_cfs_bandwidth_slice          = 5000UL;
+static unsigned int sysctl_sched_cfs_bandwidth_slice           = 5000UL;
+#endif
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table sched_fair_sysctls[] = {
+       {
+               .procname       = "sched_child_runs_first",
+               .data           = &sysctl_sched_child_runs_first,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
+#ifdef CONFIG_CFS_BANDWIDTH
+       {
+               .procname       = "sched_cfs_bandwidth_slice_us",
+               .data           = &sysctl_sched_cfs_bandwidth_slice,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = SYSCTL_ONE,
+       },
+#endif
+       {}
+};
+
+static int __init sched_fair_sysctl_init(void)
+{
+       register_sysctl_init("kernel", sched_fair_sysctls);
+       return 0;
+}
+late_initcall(sched_fair_sysctl_init);
 #endif
 
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
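The hunk above moves sched_cfs_bandwidth_slice out of the global table in kernel/sysctl.c and registers it locally; the knob still appears as /proc/sys/kernel/sched_cfs_bandwidth_slice_us. A minimal sketch of the same registration pattern, with a hypothetical "myfeature" knob standing in for a real one:

	/* Sketch only: hypothetical knob following the pattern above. */
	#include <linux/init.h>
	#include <linux/sysctl.h>

	static unsigned int sysctl_myfeature_period_us = 1000;

	static struct ctl_table myfeature_sysctls[] = {
		{
			.procname	= "myfeature_period_us",
			.data		= &sysctl_myfeature_period_us,
			.maxlen		= sizeof(unsigned int),
			.mode		= 0644,
			.proc_handler	= proc_dointvec_minmax,
			.extra1		= SYSCTL_ONE,	/* reject values below 1 */
		},
		{}	/* empty entry terminates the table */
	};

	static int __init myfeature_sysctl_init(void)
	{
		/* Creates /proc/sys/kernel/myfeature_period_us. */
		register_sysctl_init("kernel", myfeature_sysctls);
		return 0;
	}
	late_initcall(myfeature_sysctl_init);

Making the variable static in the process (as the diff does for sysctl_sched_cfs_bandwidth_slice) is the point of the exercise: the knob no longer needs external linkage once the ctl_table lives next to its user.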
@@ -313,19 +344,6 @@ const struct sched_class fair_sched_class;
 #define for_each_sched_entity(se) \
                for (; se; se = se->parent)
 
-static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
-{
-       if (!path)
-               return;
-
-       if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
-               autogroup_path(cfs_rq->tg, path, len);
-       else if (cfs_rq && cfs_rq->tg->css.cgroup)
-               cgroup_path(cfs_rq->tg->css.cgroup, path, len);
-       else
-               strlcpy(path, "(null)", len);
-}
-
 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
        struct rq *rq = rq_of(cfs_rq);
@@ -493,12 +511,6 @@ static int se_is_idle(struct sched_entity *se)
 #define for_each_sched_entity(se) \
                for (; se; se = NULL)
 
-static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
-{
-       if (path)
-               strlcpy(path, "(null)", len);
-}
-
 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
        return true;
@@ -3829,11 +3841,11 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 
        se->avg.runnable_sum = se->avg.runnable_avg * divider;
 
-       se->avg.load_sum = divider;
-       if (se_weight(se)) {
-               se->avg.load_sum =
-                       div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
-       }
+       se->avg.load_sum = se->avg.load_avg * divider;
+       if (se_weight(se) < se->avg.load_sum)
+               se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se));
+       else
+               se->avg.load_sum = 1;
 
        enqueue_load_avg(cfs_rq, se);
        cfs_rq->avg.util_avg += se->avg.util_avg;
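The rewrite above guards a rounding corner case: with a tiny load_avg and a large weight, the old div_u64() could round load_sum down to 0 while load_avg stayed non-zero, leaving the two inconsistent. A standalone illustration with made-up numbers (47742 is on the order of PELT's LOAD_AVG_MAX; 88761 is the nice -20 entity weight):

	/* Illustrative only; mirrors the clamp above with fabricated inputs. */
	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t load_avg = 1;		/* tiny but non-zero entity load */
		uint64_t divider  = 47742;	/* ~LOAD_AVG_MAX */
		uint64_t weight   = 88761;	/* nice -20 entity weight */

		uint64_t load_sum = load_avg * divider;
		if (weight < load_sum)
			load_sum /= weight;	/* normal case */
		else
			load_sum = 1;		/* old code computed 0 here */

		printf("load_sum=%llu\n", (unsigned long long)load_sum);
		return 0;
	}

With these inputs the old expression div_u64(1 * 47742, 88761) yields 0; the new else branch pins load_sum to 1 so it agrees with the non-zero load_avg.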
@@ -4846,11 +4858,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
 
        cfs_rq->throttle_count--;
        if (!cfs_rq->throttle_count) {
-               cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
-                                            cfs_rq->throttled_clock_task;
+               cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
+                                            cfs_rq->throttled_clock_pelt;
 
                /* Add cfs_rq with load or one or more already running entities to the list */
-               if (!cfs_rq_is_decayed(cfs_rq) || cfs_rq->nr_running)
+               if (!cfs_rq_is_decayed(cfs_rq))
                        list_add_leaf_cfs_rq(cfs_rq);
        }
 
@@ -4864,7 +4876,7 @@ static int tg_throttle_down(struct task_group *tg, void *data)
 
        /* group is entering throttled state, stop time */
        if (!cfs_rq->throttle_count) {
-               cfs_rq->throttled_clock_task = rq_clock_task(rq);
+               cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
                list_del_leaf_cfs_rq(cfs_rq);
        }
        cfs_rq->throttle_count++;
@@ -5308,7 +5320,7 @@ static void sync_throttle(struct task_group *tg, int cpu)
        pcfs_rq = tg->parent->cfs_rq[cpu];
 
        cfs_rq->throttle_count = pcfs_rq->throttle_count;
-       cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
+       cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu));
 }
 
 /* conditionally throttle active cfs_rq's from put_prev_entity() */
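The three hunks above switch throttled-time bookkeeping from the task clock to the PELT clock, so the window a group spends throttled is measured on the same clock PELT itself advances on. Reduced to a sketch with simplified types (the real code reads rq_clock_pelt()), the freeze/credit pattern is:

	/* Sketch of the accounting pattern above; not kernel code. */
	#include <stdint.h>

	struct throttle_acct {
		uint64_t stopped_at;	/* clock value when the group was throttled */
		uint64_t total;		/* accumulated throttled time */
	};

	static void throttle_start(struct throttle_acct *a, uint64_t now)
	{
		a->stopped_at = now;	/* tg_throttle_down(): time stops here */
	}

	static void throttle_end(struct throttle_acct *a, uint64_t now)
	{
		a->total += now - a->stopped_at;	/* tg_unthrottle_up(): credit the window */
	}

sync_throttle() seeds a child's stopped_at from the current PELT clock so a group created while its parent is throttled starts from a consistent baseline.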
@@ -6544,108 +6556,19 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 }
 
 /*
- * cpu_util_without: compute cpu utilization without any contributions from *p
- * @cpu: the CPU which utilization is requested
- * @p: the task which utilization should be discounted
- *
- * The utilization of a CPU is defined by the utilization of tasks currently
- * enqueued on that CPU as well as tasks which are currently sleeping after an
- * execution on that CPU.
- *
- * This method returns the utilization of the specified CPU by discounting the
- * utilization of the specified task, whenever the task is currently
- * contributing to the CPU utilization.
- */
-static unsigned long cpu_util_without(int cpu, struct task_struct *p)
-{
-       struct cfs_rq *cfs_rq;
-       unsigned int util;
-
-       /* Task has no contribution or is new */
-       if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
-               return cpu_util_cfs(cpu);
-
-       cfs_rq = &cpu_rq(cpu)->cfs;
-       util = READ_ONCE(cfs_rq->avg.util_avg);
-
-       /* Discount task's util from CPU's util */
-       lsub_positive(&util, task_util(p));
-
-       /*
-        * Covered cases:
-        *
-        * a) if *p is the only task sleeping on this CPU, then:
-        *      cpu_util (== task_util) > util_est (== 0)
-        *    and thus we return:
-        *      cpu_util_without = (cpu_util - task_util) = 0
-        *
-        * b) if other tasks are SLEEPING on this CPU, which is now exiting
-        *    IDLE, then:
-        *      cpu_util >= task_util
-        *      cpu_util > util_est (== 0)
-        *    and thus we discount *p's blocked utilization to return:
-        *      cpu_util_without = (cpu_util - task_util) >= 0
-        *
-        * c) if other tasks are RUNNABLE on that CPU and
-        *      util_est > cpu_util
-        *    then we use util_est since it returns a more restrictive
-        *    estimation of the spare capacity on that CPU, by just
-        *    considering the expected utilization of tasks already
-        *    runnable on that CPU.
-        *
-        * Cases a) and b) are covered by the above code, while case c) is
-        * covered by the following code when estimated utilization is
-        * enabled.
-        */
-       if (sched_feat(UTIL_EST)) {
-               unsigned int estimated =
-                       READ_ONCE(cfs_rq->avg.util_est.enqueued);
-
-               /*
-                * Despite the following checks we still have a small window
-                * for a possible race, when an execl's select_task_rq_fair()
-                * races with LB's detach_task():
-                *
-                *   detach_task()
-                *     p->on_rq = TASK_ON_RQ_MIGRATING;
-                *     ---------------------------------- A
-                *     deactivate_task()                   \
-                *       dequeue_task()                     + RaceTime
-                *         util_est_dequeue()              /
-                *     ---------------------------------- B
-                *
-                * The additional check on "current == p" it's required to
-                * properly fix the execl regression and it helps in further
-                * reducing the chances for the above race.
-                */
-               if (unlikely(task_on_rq_queued(p) || current == p))
-                       lsub_positive(&estimated, _task_util_est(p));
-
-               util = max(util, estimated);
-       }
-
-       /*
-        * Utilization (estimated) can exceed the CPU capacity, thus let's
-        * clamp to the maximum CPU capacity to ensure consistency with
-        * cpu_util.
-        */
-       return min_t(unsigned long, util, capacity_orig_of(cpu));
-}
-
-/*
- * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
- * to @dst_cpu.
+ * Predicts what cpu_util(@cpu) would return if @p was removed from @cpu
+ * (@dst_cpu = -1) or migrated to @dst_cpu.
  */
 static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
 {
        struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
-       unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
+       unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
 
        /*
-        * If @p migrates from @cpu to another, remove its contribution. Or,
-        * if @p migrates from another CPU to @cpu, add its contribution. In
-        * the other cases, @cpu is not impacted by the migration, so the
-        * util_avg should already be correct.
+        * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its
+        * contribution. If @p migrates from another CPU to @cpu add its
+        * contribution. In all the other cases @cpu is not impacted by the
+        * migration so its util_avg is already correct.
         */
        if (task_cpu(p) == cpu && dst_cpu != cpu)
                lsub_positive(&util, task_util(p));
@@ -6653,16 +6576,40 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
                util += task_util(p);
 
        if (sched_feat(UTIL_EST)) {
+               unsigned long util_est;
+
                util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
 
                /*
-                * During wake-up, the task isn't enqueued yet and doesn't
-                * appear in the cfs_rq->avg.util_est.enqueued of any rq,
-                * so just add it (if needed) to "simulate" what will be
-                * cpu_util after the task has been enqueued.
+                * During wake-up @p isn't enqueued yet and doesn't contribute
+                * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued.
+                * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p
+                * has been enqueued.
+                *
+                * During exec (@dst_cpu = -1) @p is enqueued and does
+                * contribute to cpu_rq(cpu)->cfs.util_est.enqueued.
+                * Remove it to "simulate" cpu_util without @p's contribution.
+                *
+                * Despite the task_on_rq_queued(@p) check there is still a
+                * small window for a possible race when an exec
+                * select_task_rq_fair() races with LB's detach_task().
+                *
+                *   detach_task()
+                *     deactivate_task()
+                *       p->on_rq = TASK_ON_RQ_MIGRATING;
+                *       -------------------------------- A
+                *       dequeue_task()                    \
+                *         dequeue_task_fair()              + Race Time
+                *           util_est_dequeue()            /
+                *       -------------------------------- B
+                *
+                * The additional check "current == p" is required to further
+                * reduce the race window.
                 */
                if (dst_cpu == cpu)
                        util_est += _task_util_est(p);
+               else if (unlikely(task_on_rq_queued(p) || current == p))
+                       lsub_positive(&util_est, _task_util_est(p));
 
                util = max(util, util_est);
        }
@@ -6670,6 +6617,28 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
        return min(util, capacity_orig_of(cpu));
 }
 
+/*
+ * cpu_util_without: compute cpu utilization without any contributions from *p
+ * @cpu: the CPU which utilization is requested
+ * @p: the task which utilization should be discounted
+ *
+ * The utilization of a CPU is defined by the utilization of tasks currently
+ * enqueued on that CPU as well as tasks which are currently sleeping after an
+ * execution on that CPU.
+ *
+ * This method returns the utilization of the specified CPU by discounting the
+ * utilization of the specified task, whenever the task is currently
+ * contributing to the CPU utilization.
+ */
+static unsigned long cpu_util_without(int cpu, struct task_struct *p)
+{
+       /* Task has no contribution or is new */
+       if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
+               return cpu_util_cfs(cpu);
+
+       return cpu_util_next(cpu, p, -1);
+}
+
 /*
  * compute_energy(): Estimates the energy that @pd would consume if @p was
  * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
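Taken together, cpu_util_next() now handles both directions and cpu_util_without() becomes a thin wrapper over the @dst_cpu = -1 case. A simplified, standalone model of the arithmetic (illustrative only; the kernel operates on PELT averages, uses _task_util_est() on the estimate side, and gates the subtraction on task_on_rq_queued(p) || current == p):

	/* Hypothetical helper modelling cpu_util_next(); not the kernel function. */
	static unsigned long model_cpu_util_next(unsigned long util_avg,
						 unsigned long util_est,
						 unsigned long task_util,
						 int task_on_cpu, int dst_is_cpu)
	{
		unsigned long util = util_avg;

		if (task_on_cpu && !dst_is_cpu)		/* leaving: discount, clamped at 0 */
			util = util >= task_util ? util - task_util : 0;	/* lsub_positive() */
		else if (!task_on_cpu && dst_is_cpu)	/* arriving: add contribution */
			util += task_util;

		if (dst_is_cpu)				/* simulate the pending enqueue */
			util_est += task_util;
		else if (task_on_cpu)			/* exec/removal: drop queued estimate */
			util_est = util_est >= task_util ? util_est - task_util : 0;

		return util > util_est ? util : util_est;	/* max(util, util_est) */
	}

The real function additionally clamps the result with min(util, capacity_orig_of(cpu)), as kept at the hunk's end.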
@@ -9460,8 +9429,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
                local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
                                  local->group_capacity;
 
-               sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
-                               sds->total_capacity;
                /*
                 * If the local group is more loaded than the selected
                 * busiest group don't try to pull any tasks.
@@ -9470,6 +9437,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
                        env->imbalance = 0;
                        return;
                }
+
+               sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
+                               sds->total_capacity;
        }
 
        /*
@@ -9495,7 +9465,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
  * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
  * has_spare        nr_idle   balanced   N/A    N/A  balanced   balanced
  * fully_busy       nr_idle   nr_idle    N/A    N/A  balanced   balanced
- * misfit_task      force     N/A        N/A    N/A  force      force
+ * misfit_task      force     N/A        N/A    N/A  N/A        N/A
  * asym_packing     force     force      N/A    N/A  force      force
  * imbalanced       force     force      N/A    N/A  force      force
  * overloaded       force     force      N/A    N/A  force      avg_load
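The two hunks above defer the sds->avg_load division until after the "local group is more loaded" early return, and downgrade misfit_task against imbalanced/overloaded local groups from "force" to "N/A" in the decision matrix. For reference, the (unchanged) avg_load scaling works out as in this illustrative check with made-up numbers:

	/* Illustrative check of the avg_load scaling; not kernel code. */
	#include <stdio.h>

	#define SCHED_CAPACITY_SCALE 1024UL

	int main(void)
	{
		unsigned long group_load     = 3072;	/* made-up runnable load sum */
		unsigned long group_capacity = 2048;	/* made-up: two full CPUs */
		unsigned long avg_load;

		avg_load = group_load * SCHED_CAPACITY_SCALE / group_capacity;
		printf("avg_load=%lu\n", avg_load);	/* 1536 > 1024: overloaded */
		return 0;
	}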
@@ -11881,101 +11851,3 @@ __init void init_sched_fair_class(void)
 #endif /* SMP */
 
 }
-
-/*
- * Helper functions to facilitate extracting info from tracepoints.
- */
-
-const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
-{
-#ifdef CONFIG_SMP
-       return cfs_rq ? &cfs_rq->avg : NULL;
-#else
-       return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
-
-char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
-{
-       if (!cfs_rq) {
-               if (str)
-                       strlcpy(str, "(null)", len);
-               else
-                       return NULL;
-       }
-
-       cfs_rq_tg_path(cfs_rq, str, len);
-       return str;
-}
-EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
-
-int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
-{
-       return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
-}
-EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
-
-const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
-{
-#ifdef CONFIG_SMP
-       return rq ? &rq->avg_rt : NULL;
-#else
-       return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
-
-const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
-{
-#ifdef CONFIG_SMP
-       return rq ? &rq->avg_dl : NULL;
-#else
-       return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
-
-const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
-{
-#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
-       return rq ? &rq->avg_irq : NULL;
-#else
-       return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
-
-int sched_trace_rq_cpu(struct rq *rq)
-{
-       return rq ? cpu_of(rq) : -1;
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
-
-int sched_trace_rq_cpu_capacity(struct rq *rq)
-{
-       return rq ?
-#ifdef CONFIG_SMP
-               rq->cpu_capacity
-#else
-               SCHED_CAPACITY_SCALE
-#endif
-               : -1;
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity);
-
-const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
-{
-#ifdef CONFIG_SMP
-       return rd ? rd->span : NULL;
-#else
-       return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_rd_span);
-
-int sched_trace_rq_nr_running(struct rq *rq)
-{
-        return rq ? rq->nr_running : -1;
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);
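With the sched_trace_*() accessors gone (which is also why cfs_rq_tg_path(), their only remaining user, could be dropped earlier in this diff), a tracing module attached to the scheduler's bare tracepoints is expected to dereference the structures directly. A hypothetical probe sketch, assuming the relevant struct definitions are visible to the module, which is the premise of removing the helpers:

	/* Hypothetical out-of-tree probe; names and visibility are assumptions. */
	static void my_pelt_cfs_probe(void *data, struct cfs_rq *cfs_rq)
	{
		const struct sched_avg *avg = &cfs_rq->avg;	/* was sched_trace_cfs_rq_avg() */

		trace_printk("util_avg=%lu load_avg=%lu\n",
			     avg->util_avg, avg->load_avg);
	}

	/* Attached against the existing pelt_cfs tracepoint, e.g.: */
	/* register_trace_pelt_cfs_tp(my_pelt_cfs_probe, NULL); */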