psi: Optimize task switch inside shared cgroups
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 967732c..ee3c5b4 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
  * delayed on that resource such that nobody is advancing and the CPU
  * goes idle. This leaves both workload and CPU unproductive.
  *
- * (Naturally, the FULL state doesn't exist for the CPU resource.)
+ * Naturally, the FULL state doesn't exist for the CPU resource at the
+ * system level, but it does exist at the cgroup level: it means all
+ * non-idle tasks in a cgroup are delayed on the CPU resource, which is
+ * used by others outside the cgroup or throttled by the cgroup's
+ * cpu.max configuration.
  *
  *     SOME = nr_delayed_tasks != 0
  *     FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0
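[Editor's note: a minimal standalone sketch of the two definitions above; the
struct and helper names (res_counts, res_some, res_full) are hypothetical and
not kernel code, they only restate SOME/FULL in terms of per-CPU task counts.]

    #include <stdbool.h>

    /* Hypothetical counters for one CPU and one resource. */
    struct res_counts {
            unsigned int nr_delayed;        /* tasks stalled on the resource */
            unsigned int nr_running;        /* tasks still making progress */
    };

    static bool res_some(const struct res_counts *c)
    {
            /* SOME: at least one task is stalled. */
            return c->nr_delayed != 0;
    }

    static bool res_full(const struct res_counts *c)
    {
            /* FULL: tasks are stalled and nothing productive is running. */
            return c->nr_delayed != 0 && c->nr_running == 0;
    }
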
@@ -216,15 +219,17 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
 {
        switch (state) {
        case PSI_IO_SOME:
-               return tasks[NR_IOWAIT];
+               return unlikely(tasks[NR_IOWAIT]);
        case PSI_IO_FULL:
-               return tasks[NR_IOWAIT] && !tasks[NR_RUNNING];
+               return unlikely(tasks[NR_IOWAIT] && !tasks[NR_RUNNING]);
        case PSI_MEM_SOME:
-               return tasks[NR_MEMSTALL];
+               return unlikely(tasks[NR_MEMSTALL]);
        case PSI_MEM_FULL:
-               return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
+               return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
        case PSI_CPU_SOME:
-               return tasks[NR_RUNNING] > tasks[NR_ONCPU];
+               return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
+       case PSI_CPU_FULL:
+               return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]);
        case PSI_NONIDLE:
                return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
                        tasks[NR_RUNNING];
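[Editor's note: the unlikely() wrappers added above are branch-prediction
hints: pressure states are the exception, so the no-pressure path should be
the fall-through. In the kernel the hint expands to __builtin_expect(); the
sketch below renames the macro (sketch_unlikely, io_some are made up) so it
stands alone.]

    /* What the hints above boil down to; see <linux/compiler.h>. */
    #define sketch_unlikely(x)      __builtin_expect(!!(x), 0)

    static int io_some(unsigned int nr_iowait)
    {
            /* Pressure is the rare case: the zero path falls through. */
            return sketch_unlikely(nr_iowait);
    }
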
@@ -639,8 +644,7 @@ static void poll_timer_fn(struct timer_list *t)
        wake_up_interruptible(&group->poll_wait);
 }
 
-static void record_times(struct psi_group_cpu *groupc, int cpu,
-                        bool memstall_tick)
+static void record_times(struct psi_group_cpu *groupc, int cpu)
 {
        u32 delta;
        u64 now;
@@ -659,27 +663,13 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
                groupc->times[PSI_MEM_SOME] += delta;
                if (groupc->state_mask & (1 << PSI_MEM_FULL))
                        groupc->times[PSI_MEM_FULL] += delta;
-               else if (memstall_tick) {
-                       u32 sample;
-                       /*
-                        * Since we care about lost potential, a
-                        * memstall is FULL when there are no other
-                        * working tasks, but also when the CPU is
-                        * actively reclaiming and nothing productive
-                        * could run even if it were runnable.
-                        *
-                        * When the timer tick sees a reclaiming CPU,
-                        * regardless of runnable tasks, sample a FULL
-                        * tick (or less if it hasn't been a full tick
-                        * since the last state change).
-                        */
-                       sample = min(delta, (u32)jiffies_to_nsecs(1));
-                       groupc->times[PSI_MEM_FULL] += sample;
-               }
        }
 
-       if (groupc->state_mask & (1 << PSI_CPU_SOME))
+       if (groupc->state_mask & (1 << PSI_CPU_SOME)) {
                groupc->times[PSI_CPU_SOME] += delta;
+               if (groupc->state_mask & (1 << PSI_CPU_FULL))
+                       groupc->times[PSI_CPU_FULL] += delta;
+       }
 
        if (groupc->state_mask & (1 << PSI_NONIDLE))
                groupc->times[PSI_NONIDLE] += delta;
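[Editor's note: because a FULL state is by definition a subset of the
corresponding SOME state, the new PSI_CPU_FULL time nests inside the
PSI_CPU_SOME branch, mirroring the memory accounting above. A sketch of that
pattern with hypothetical names (stall_times, add_delta):]

    struct stall_times {
            unsigned long long some;
            unsigned long long full;
    };

    static void add_delta(struct stall_times *t, unsigned int state_mask,
                          int some_bit, int full_bit, unsigned int delta)
    {
            if (state_mask & (1U << some_bit)) {
                    t->some += delta;
                    /* FULL implies SOME, so it only needs checking here. */
                    if (state_mask & (1U << full_bit))
                            t->full += delta;
            }
    }
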
@@ -706,7 +696,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
         */
        write_seqcount_begin(&groupc->seq);
 
-       record_times(groupc, cpu, false);
+       record_times(groupc, cpu);
 
        for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
                if (!(m & (1 << t)))
@@ -730,6 +720,18 @@ static void psi_group_change(struct psi_group *group, int cpu,
                if (test_state(groupc->tasks, s))
                        state_mask |= (1 << s);
        }
+
+       /*
+        * Since we care about lost potential, a memstall is FULL
+        * when there are no other working tasks, but also when
+        * the CPU is actively reclaiming and nothing productive
+        * could run even if it were runnable. So when the current
+        * task in a cgroup is in_memstall, the corresponding groupc
+        * on that cpu is in PSI_MEM_FULL state.
+        */
+       if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall))
+               state_mask |= (1 << PSI_MEM_FULL);
+
        groupc->state_mask = state_mask;
 
        write_seqcount_end(&groupc->seq);
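[Editor's note: the state_mask, including the new MEM_FULL override, is
published inside the groupc->seq write section, which is what lets the
aggregation side read the per-CPU times and mask without locking and simply
retry on a race. A rough sketch of that read side follows; read_state_mask is
a hypothetical helper modeled on the generic seqcount pattern, while
get_recent_times() in psi.c is the real reader.]

    static u32 read_state_mask(struct psi_group_cpu *groupc)
    {
            unsigned int seq;
            u32 mask;

            do {
                    seq = read_seqcount_begin(&groupc->seq);
                    mask = groupc->state_mask;
            } while (read_seqcount_retry(&groupc->seq, seq));

            return mask;
    }
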
@@ -815,17 +817,21 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
        void *iter;
 
        if (next->pid) {
+               bool identical_state;
+
                psi_flags_change(next, 0, TSK_ONCPU);
                /*
-                * When moving state between tasks, the group that
-                * contains them both does not change: we can stop
-                * updating the tree once we reach the first common
-                * ancestor. Iterate @next's ancestors until we
-                * encounter @prev's state.
+                * When switching between tasks that have an identical
+                * runtime state, the cgroup that contains both tasks
+                * does not change: we can stop updating the tree once
+                * we reach the first common ancestor. Iterate @next's
+                * ancestors only until we encounter @prev's ONCPU.
                 */
+               identical_state = prev->psi_flags == next->psi_flags;
                iter = NULL;
                while ((group = iterate_groups(next, &iter))) {
-                       if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
+                       if (identical_state &&
+                           per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
                                common = group;
                                break;
                        }
@@ -834,35 +840,35 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
                }
        }
 
-       /*
-        * If this is a voluntary sleep, dequeue will have taken care
-        * of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We
-        * only need to deal with it during preemption.
-        */
-       if (sleep)
-               return;
-
        if (prev->pid) {
-               psi_flags_change(prev, TSK_ONCPU, 0);
+               int clear = TSK_ONCPU, set = 0;
 
-               iter = NULL;
-               while ((group = iterate_groups(prev, &iter)) && group != common)
-                       psi_group_change(group, cpu, TSK_ONCPU, 0, true);
-       }
-}
+               /*
+                * When we're going to sleep, psi_dequeue() lets us handle
+                * TSK_RUNNING and TSK_IOWAIT here, where we can combine them
+                * with TSK_ONCPU and save walking common ancestors twice.
+                */
+               if (sleep) {
+                       clear |= TSK_RUNNING;
+                       if (prev->in_iowait)
+                               set |= TSK_IOWAIT;
+               }
 
-void psi_memstall_tick(struct task_struct *task, int cpu)
-{
-       struct psi_group *group;
-       void *iter = NULL;
+               psi_flags_change(prev, clear, set);
 
-       while ((group = iterate_groups(task, &iter))) {
-               struct psi_group_cpu *groupc;
+               iter = NULL;
+               while ((group = iterate_groups(prev, &iter)) && group != common)
+                       psi_group_change(group, cpu, clear, set, true);
 
-               groupc = per_cpu_ptr(group->pcpu, cpu);
-               write_seqcount_begin(&groupc->seq);
-               record_times(groupc, cpu, true);
-               write_seqcount_end(&groupc->seq);
+               /*
+                * TSK_ONCPU is handled up to the common ancestor. If we're tasked
+                * with dequeuing too, finish that for the rest of the hierarchy.
+                */
+               if (sleep) {
+                       clear &= ~TSK_ONCPU;
+                       for (; group; group = iterate_groups(prev, &iter))
+                               psi_group_change(group, cpu, clear, set, true);
+               }
        }
 }
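[Editor's note: a worked example of the restructured walk above, with a
hypothetical hierarchy: if prev runs in /A/B/C and next in /A/B/D with
identical psi_flags, the groups from /A/B upward already count one ONCPU task
and don't change, so @next's walk stops at /A/B and @prev's walk only clears
TSK_ONCPU below it; on a voluntary sleep the common ancestor and everything
above still need the dequeue part applied, minus the ONCPU bit, which is what
the second loop does. The sketch below captures that two-phase shape; struct
node, walk_prev and the callback are made-up stand-ins for psi groups.]

    struct node { struct node *parent; };

    static void walk_prev(struct node *leaf, struct node *common, int sleep,
                          int clear, int set, int oncpu_flag,
                          void (*apply)(struct node *, int clear, int set))
    {
            struct node *n = leaf;

            /* Groups strictly below the common ancestor: full change set. */
            for (; n && n != common; n = n->parent)
                    apply(n, clear, set);

            /* On sleep, the rest of the hierarchy still needs the dequeue
             * part, but its ONCPU count is unchanged (next took it over). */
            if (sleep) {
                    clear &= ~oncpu_flag;
                    for (; n; n = n->parent)
                            apply(n, clear, set);
            }
    }
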
 
@@ -1018,7 +1024,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
                group->avg_next_update = update_averages(group, now);
        mutex_unlock(&group->avgs_lock);
 
-       for (full = 0; full < 2 - (res == PSI_CPU); full++) {
+       for (full = 0; full < 2; full++) {
                unsigned long avg[3];
                u64 total;
                int w;
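
[Editor's note: with the "2 - (res == PSI_CPU)" special case gone, psi_show()
emits a "full" line for the CPU resource as well, so /proc/pressure/cpu and
the cgroup cpu.pressure file take the same two-line shape as io and memory.
The values below are purely illustrative:]

    some avg10=0.00 avg60=0.00 avg300=0.00 total=0
    full avg10=0.00 avg60=0.00 avg300=0.00 total=0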