Merge tag 'sched-psi-2022-10-14' of git://git.kernel.org/pub/scm/linux/kernel/git...

author Linus Torvalds <torvalds@linux-foundation.org>

Fri, 14 Oct 2022 20:03:00 +0000 (13:03 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 14 Oct 2022 20:03:00 +0000 (13:03 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 14 Oct 2022 20:03:00 +0000 (13:03 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 14 Oct 2022 20:03:00 +0000 (13:03 -0700)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst

index 7bcfb38..dc254a3 100644 (file)
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -976,6 +976,29 @@ All cgroup core files are prefixed with "cgroup."
         killing cgroups is a process directed operation, i.e. it affects
         the whole thread-group.
  
+  cgroup.pressure
+       A read-write single value file whose allowed values are "0" and "1".
+       The default is "1".
+
+       Writing "0" to the file will disable the cgroup PSI accounting.
+       Writing "1" to the file will re-enable the cgroup PSI accounting.
+
+       This control attribute is not hierarchical, so disabling or enabling PSI
+       accounting in a cgroup does not affect PSI accounting in descendants
+       and doesn't need to pass enablement via ancestors from root.
+
+       The reason this control attribute exists is that PSI accounts stalls for
+       each cgroup separately and aggregates them at each level of the hierarchy.
+       This may cause non-negligible overhead for some workloads running
+       deep in the hierarchy, in which case this control attribute can
+       be used to disable PSI accounting in the non-leaf cgroups.
+
+  irq.pressure
+       A read-write nested-keyed file.
+
+       Shows pressure stall information for IRQ/SOFTIRQ. See
+       :ref:`Documentation/accounting/psi.rst <psi>` for details.
+
  Controllers
  ===========
  
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h

index 8f481d1..6e01f10 100644 (file)
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -428,6 +428,9 @@ struct cgroup {
         struct cgroup_file procs_file;  /* handle for "cgroup.procs" */
         struct cgroup_file events_file; /* handle for "cgroup.events" */
  
+       /* handles for "{cpu,memory,io,irq}.pressure" */
+       struct cgroup_file psi_files[NR_PSI_RESOURCES];
+
         /*
          * The bitmask of subsystems enabled on the child cgroups.
          * ->subtree_control is the one configured through
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h

index 23b102b..f2a9f22 100644 (file)
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -682,11 +682,6 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
         pr_cont_kernfs_path(cgrp->kn);
  }
  
-static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
-{
-       return cgrp->psi;
-}
-
  bool cgroup_psi_enabled(void);
  
  static inline void cgroup_init_kthreadd(void)
diff --git a/include/linux/psi.h b/include/linux/psi.h

index dd74411..b029a84 100644 (file)
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -7,6 +7,7 @@
  #include <linux/sched.h>
  #include <linux/poll.h>
  #include <linux/cgroup-defs.h>
+#include <linux/cgroup.h>
  
  struct seq_file;
  struct css_set;
@@ -18,10 +19,6 @@ extern struct psi_group psi_system;
  
  void psi_init(void);
  
-void psi_task_change(struct task_struct *task, int clear, int set);
-void psi_task_switch(struct task_struct *prev, struct task_struct *next,
-                    bool sleep);
-
  void psi_memstall_enter(unsigned long *flags);
  void psi_memstall_leave(unsigned long *flags);
  
@@ -34,9 +31,15 @@ __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
                         poll_table *wait);
  
  #ifdef CONFIG_CGROUPS
+static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
+{
+       return cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+}
+
  int psi_cgroup_alloc(struct cgroup *cgrp);
  void psi_cgroup_free(struct cgroup *cgrp);
  void cgroup_move_task(struct task_struct *p, struct css_set *to);
+void psi_cgroup_restart(struct psi_group *group);
  #endif
  
  #else /* CONFIG_PSI */
@@ -58,6 +61,7 @@ static inline void cgroup_move_task(struct task_struct *p, struct css_set *to)
  {
         rcu_assign_pointer(p->cgroups, to);
  }
+static inline void psi_cgroup_restart(struct psi_group *group) {}
  #endif
  
  #endif /* CONFIG_PSI */
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h

index c7fe7c0..6e43727 100644 (file)
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -15,13 +15,6 @@ enum psi_task_count {
         NR_IOWAIT,
         NR_MEMSTALL,
         NR_RUNNING,
-       /*
-        * This can't have values other than 0 or 1 and could be
-        * implemented as a bit flag. But for now we still have room
-        * in the first cacheline of psi_group_cpu, and this way we
-        * don't have to special case any state tracking for it.
-        */
-       NR_ONCPU,
         /*
          * For IO and CPU stalls the presence of running/oncpu tasks
          * in the domain means a partial rather than a full stall.
@@ -32,22 +25,27 @@ enum psi_task_count {
          * threads and memstall ones.
          */
         NR_MEMSTALL_RUNNING,
-       NR_PSI_TASK_COUNTS = 5,
+       NR_PSI_TASK_COUNTS = 4,
  };
  
  /* Task state bitmasks */
  #define TSK_IOWAIT     (1 << NR_IOWAIT)
  #define TSK_MEMSTALL   (1 << NR_MEMSTALL)
  #define TSK_RUNNING    (1 << NR_RUNNING)
-#define TSK_ONCPU      (1 << NR_ONCPU)
  #define TSK_MEMSTALL_RUNNING   (1 << NR_MEMSTALL_RUNNING)
  
+/* Only one task can be scheduled, no corresponding task count */
+#define TSK_ONCPU      (1 << NR_PSI_TASK_COUNTS)
+
  /* Resources that workloads could be stalled on */
  enum psi_res {
         PSI_IO,
         PSI_MEM,
         PSI_CPU,
-       NR_PSI_RESOURCES = 3,
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+       PSI_IRQ,
+#endif
+       NR_PSI_RESOURCES,
  };
  
  /*
@@ -63,11 +61,17 @@ enum psi_states {
         PSI_MEM_FULL,
         PSI_CPU_SOME,
         PSI_CPU_FULL,
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+       PSI_IRQ_FULL,
+#endif
         /* Only per-CPU, to weigh the CPU in the global average: */
         PSI_NONIDLE,
-       NR_PSI_STATES = 7,
+       NR_PSI_STATES,
  };
  
+/* Use one bit in the state mask to track TSK_ONCPU */
+#define PSI_ONCPU      (1 << NR_PSI_STATES)
+
  enum psi_aggregators {
         PSI_AVGS = 0,
         PSI_POLL,
@@ -147,6 +151,9 @@ struct psi_trigger {
  };
  
  struct psi_group {
+       struct psi_group *parent;
+       bool enabled;
+
         /* Protects data used by the aggregator */
         struct mutex avgs_lock;
  
@@ -188,6 +195,8 @@ struct psi_group {
  
  #else /* CONFIG_PSI */
  
+#define NR_PSI_RESOURCES       0
+
  struct psi_group { };
  
  #endif /* CONFIG_PSI */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c

index 764bdd5..7f48667 100644 (file)
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3698,27 +3698,27 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
  static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
  {
         struct cgroup *cgrp = seq_css(seq)->cgroup;
-       struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+       struct psi_group *psi = cgroup_psi(cgrp);
  
         return psi_show(seq, psi, PSI_IO);
  }
  static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
  {
         struct cgroup *cgrp = seq_css(seq)->cgroup;
-       struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+       struct psi_group *psi = cgroup_psi(cgrp);
  
         return psi_show(seq, psi, PSI_MEM);
  }
  static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
  {
         struct cgroup *cgrp = seq_css(seq)->cgroup;
-       struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+       struct psi_group *psi = cgroup_psi(cgrp);
  
         return psi_show(seq, psi, PSI_CPU);
  }
  
-static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
-                                         size_t nbytes, enum psi_res res)
+static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
+                             size_t nbytes, enum psi_res res)
  {
         struct cgroup_file_ctx *ctx = of->priv;
         struct psi_trigger *new;
@@ -3738,7 +3738,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
                 return -EBUSY;
         }
  
-       psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+       psi = cgroup_psi(cgrp);
         new = psi_trigger_create(psi, buf, res);
         if (IS_ERR(new)) {
                 cgroup_put(cgrp);
@@ -3755,21 +3755,86 @@ static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
                                           char *buf, size_t nbytes,
                                           loff_t off)
  {
-       return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
+       return pressure_write(of, buf, nbytes, PSI_IO);
  }
  
  static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
                                           char *buf, size_t nbytes,
                                           loff_t off)
  {
-       return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
+       return pressure_write(of, buf, nbytes, PSI_MEM);
  }
  
  static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
                                           char *buf, size_t nbytes,
                                           loff_t off)
  {
-       return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
+       return pressure_write(of, buf, nbytes, PSI_CPU);
+}
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static int cgroup_irq_pressure_show(struct seq_file *seq, void *v)
+{
+       struct cgroup *cgrp = seq_css(seq)->cgroup;
+       struct psi_group *psi = cgroup_psi(cgrp);
+
+       return psi_show(seq, psi, PSI_IRQ);
+}
+
+static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
+                                        char *buf, size_t nbytes,
+                                        loff_t off)
+{
+       return pressure_write(of, buf, nbytes, PSI_IRQ);
+}
+#endif
+
+static int cgroup_pressure_show(struct seq_file *seq, void *v)
+{
+       struct cgroup *cgrp = seq_css(seq)->cgroup;
+       struct psi_group *psi = cgroup_psi(cgrp);
+
+       seq_printf(seq, "%d\n", psi->enabled);
+
+       return 0;
+}
+
+static ssize_t cgroup_pressure_write(struct kernfs_open_file *of,
+                                    char *buf, size_t nbytes,
+                                    loff_t off)
+{
+       ssize_t ret;
+       int enable;
+       struct cgroup *cgrp;
+       struct psi_group *psi;
+
+       ret = kstrtoint(strstrip(buf), 0, &enable);
+       if (ret)
+               return ret;
+
+       if (enable < 0 || enable > 1)
+               return -ERANGE;
+
+       cgrp = cgroup_kn_lock_live(of->kn, false);
+       if (!cgrp)
+               return -ENOENT;
+
+       psi = cgroup_psi(cgrp);
+       if (psi->enabled != enable) {
+               int i;
+
+               /* show or hide {cpu,memory,io,irq}.pressure files */
+               for (i = 0; i < NR_PSI_RESOURCES; i++)
+                       cgroup_file_show(&cgrp->psi_files[i], enable);
+
+               psi->enabled = enable;
+               if (enable)
+                       psi_cgroup_restart(psi);
+       }
+
+       cgroup_kn_unlock(of->kn);
+
+       return nbytes;
  }
  
  static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
@@ -3789,6 +3854,9 @@ static void cgroup_pressure_release(struct kernfs_open_file *of)
  
  bool cgroup_psi_enabled(void)
  {
+       if (static_branch_likely(&psi_disabled))
+               return false;
+
         return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
  }
  
@@ -5175,6 +5243,7 @@ static struct cftype cgroup_psi_files[] = {
  #ifdef CONFIG_PSI
         {
                 .name = "io.pressure",
+               .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
                 .seq_show = cgroup_io_pressure_show,
                 .write = cgroup_io_pressure_write,
                 .poll = cgroup_pressure_poll,
@@ -5182,6 +5251,7 @@ static struct cftype cgroup_psi_files[] = {
         },
         {
                 .name = "memory.pressure",
+               .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
                 .seq_show = cgroup_memory_pressure_show,
                 .write = cgroup_memory_pressure_write,
                 .poll = cgroup_pressure_poll,
@@ -5189,11 +5259,27 @@ static struct cftype cgroup_psi_files[] = {
         },
         {
                 .name = "cpu.pressure",
+               .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
                 .seq_show = cgroup_cpu_pressure_show,
                 .write = cgroup_cpu_pressure_write,
                 .poll = cgroup_pressure_poll,
                 .release = cgroup_pressure_release,
         },
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+       {
+               .name = "irq.pressure",
+               .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
+               .seq_show = cgroup_irq_pressure_show,
+               .write = cgroup_irq_pressure_write,
+               .poll = cgroup_pressure_poll,
+               .release = cgroup_pressure_release,
+       },
+#endif
+       {
+               .name = "cgroup.pressure",
+               .seq_show = cgroup_pressure_show,
+               .write = cgroup_pressure_write,
+       },
  #endif /* CONFIG_PSI */
         { }     /* terminate */
  };
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index e4ce124..5800b06 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -701,6 +701,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
  
         rq->prev_irq_time += irq_delta;
         delta -= irq_delta;
+       psi_account_irqtime(rq->curr, irq_delta);
  #endif
  #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
         if (static_key_false((&paravirt_steal_rq_enabled))) {
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c

index 7f60300..ee2ecc0 100644 (file)
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -181,6 +181,7 @@ static void group_init(struct psi_group *group)
  {
         int cpu;
  
+       group->enabled = true;
         for_each_possible_cpu(cpu)
                 seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
         group->avg_last_update = sched_clock();
@@ -201,6 +202,7 @@ void __init psi_init(void)
  {
         if (!psi_enable) {
                 static_branch_enable(&psi_disabled);
+               static_branch_disable(&psi_cgroups_enabled);
                 return;
         }
  
@@ -211,7 +213,7 @@ void __init psi_init(void)
         group_init(&psi_system);
  }
  
-static bool test_state(unsigned int *tasks, enum psi_states state)
+static bool test_state(unsigned int *tasks, enum psi_states state, bool oncpu)
  {
         switch (state) {
         case PSI_IO_SOME:
@@ -224,9 +226,9 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
                 return unlikely(tasks[NR_MEMSTALL] &&
                         tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
         case PSI_CPU_SOME:
-               return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
+               return unlikely(tasks[NR_RUNNING] > oncpu);
         case PSI_CPU_FULL:
-               return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]);
+               return unlikely(tasks[NR_RUNNING] && !oncpu);
         case PSI_NONIDLE:
                 return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
                         tasks[NR_RUNNING];
@@ -688,35 +690,53 @@ static void psi_group_change(struct psi_group *group, int cpu,
                              bool wake_clock)
  {
         struct psi_group_cpu *groupc;
-       u32 state_mask = 0;
         unsigned int t, m;
         enum psi_states s;
+       u32 state_mask;
  
         groupc = per_cpu_ptr(group->pcpu, cpu);
  
         /*
-        * First we assess the aggregate resource states this CPU's
-        * tasks have been in since the last change, and account any
-        * SOME and FULL time these may have resulted in.
-        *
-        * Then we update the task counts according to the state
+        * First we update the task counts according to the state
          * change requested through the @clear and @set bits.
+        *
+        * Then, if cgroup PSI stats accounting is enabled, we
+        * assess the aggregate resource states this CPU's tasks
+        * have been in since the last change, and account any
+        * SOME and FULL time these may have resulted in.
          */
         write_seqcount_begin(&groupc->seq);
  
-       record_times(groupc, now);
+       /*
+        * Start with TSK_ONCPU, which doesn't have a corresponding
+        * task count - it's just a boolean flag directly encoded in
+        * the state mask. Clear, set, or carry the current state if
+        * no changes are requested.
+        */
+       if (unlikely(clear & TSK_ONCPU)) {
+               state_mask = 0;
+               clear &= ~TSK_ONCPU;
+       } else if (unlikely(set & TSK_ONCPU)) {
+               state_mask = PSI_ONCPU;
+               set &= ~TSK_ONCPU;
+       } else {
+               state_mask = groupc->state_mask & PSI_ONCPU;
+       }
  
+       /*
+        * The rest of the state mask is calculated based on the task
+        * counts. Update those first, then construct the mask.
+        */
         for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
                 if (!(m & (1 << t)))
                         continue;
                 if (groupc->tasks[t]) {
                         groupc->tasks[t]--;
                 } else if (!psi_bug) {
-                       printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
+                       printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
                                         cpu, t, groupc->tasks[0],
                                         groupc->tasks[1], groupc->tasks[2],
-                                       groupc->tasks[3], groupc->tasks[4],
-                                       clear, set);
+                                       groupc->tasks[3], clear, set);
                         psi_bug = 1;
                 }
         }
@@ -725,9 +745,25 @@ static void psi_group_change(struct psi_group *group, int cpu,
                 if (set & (1 << t))
                         groupc->tasks[t]++;
  
-       /* Calculate state mask representing active states */
+       if (!group->enabled) {
+               /*
+                * On the first group change after disabling PSI, conclude
+                * the current state and flush its time. This is unlikely
+                * to matter to the user, but aggregation (get_recent_times)
+                * may have already incorporated the live state into times_prev;
+                * avoid a delta sample underflow when PSI is later re-enabled.
+                */
+               if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE)))
+                       record_times(groupc, now);
+
+               groupc->state_mask = state_mask;
+
+               write_seqcount_end(&groupc->seq);
+               return;
+       }
+
         for (s = 0; s < NR_PSI_STATES; s++) {
-               if (test_state(groupc->tasks, s))
+               if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU))
                         state_mask |= (1 << s);
         }
  
@@ -739,9 +775,11 @@ static void psi_group_change(struct psi_group *group, int cpu,
          * task in a cgroup is in_memstall, the corresponding groupc
          * on that cpu is in PSI_MEM_FULL state.
          */
-       if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall))
+       if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall))
                 state_mask |= (1 << PSI_MEM_FULL);
  
+       record_times(groupc, now);
+
         groupc->state_mask = state_mask;
  
         write_seqcount_end(&groupc->seq);
@@ -753,27 +791,12 @@ static void psi_group_change(struct psi_group *group, int cpu,
                 schedule_delayed_work(&group->avgs_work, PSI_FREQ);
  }
  
-static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
+static inline struct psi_group *task_psi_group(struct task_struct *task)
  {
-       if (*iter == &psi_system)
-               return NULL;
-
  #ifdef CONFIG_CGROUPS
-       if (static_branch_likely(&psi_cgroups_enabled)) {
-               struct cgroup *cgroup = NULL;
-
-               if (!*iter)
-                       cgroup = task->cgroups->dfl_cgrp;
-               else
-                       cgroup = cgroup_parent(*iter);
-
-               if (cgroup && cgroup_parent(cgroup)) {
-                       *iter = cgroup;
-                       return cgroup_psi(cgroup);
-               }
-       }
+       if (static_branch_likely(&psi_cgroups_enabled))
+               return cgroup_psi(task_dfl_cgroup(task));
  #endif
-       *iter = &psi_system;
         return &psi_system;
  }
  
@@ -796,8 +819,6 @@ void psi_task_change(struct task_struct *task, int clear, int set)
  {
         int cpu = task_cpu(task);
         struct psi_group *group;
-       bool wake_clock = true;
-       void *iter = NULL;
         u64 now;
  
         if (!task->pid)
@@ -806,19 +827,11 @@ void psi_task_change(struct task_struct *task, int clear, int set)
         psi_flags_change(task, clear, set);
  
         now = cpu_clock(cpu);
-       /*
-        * Periodic aggregation shuts off if there is a period of no
-        * task changes, so we wake it back up if necessary. However,
-        * don't do this if the task change is the aggregation worker
-        * itself going to sleep, or we'll ping-pong forever.
-        */
-       if (unlikely((clear & TSK_RUNNING) &&
-                    (task->flags & PF_WQ_WORKER) &&
-                    wq_worker_last_func(task) == psi_avgs_work))
-               wake_clock = false;
  
-       while ((group = iterate_groups(task, &iter)))
-               psi_group_change(group, cpu, clear, set, now, wake_clock);
+       group = task_psi_group(task);
+       do {
+               psi_group_change(group, cpu, clear, set, now, true);
+       } while ((group = group->parent));
  }
  
  void psi_task_switch(struct task_struct *prev, struct task_struct *next,
@@ -826,34 +839,30 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
  {
         struct psi_group *group, *common = NULL;
         int cpu = task_cpu(prev);
-       void *iter;
         u64 now = cpu_clock(cpu);
  
         if (next->pid) {
-               bool identical_state;
-
                 psi_flags_change(next, 0, TSK_ONCPU);
                 /*
-                * When switching between tasks that have an identical
-                * runtime state, the cgroup that contains both tasks
-                * we reach the first common ancestor. Iterate @next's
-                * ancestors only until we encounter @prev's ONCPU.
+                * Set TSK_ONCPU on @next's cgroups. If @next shares any
+                * ancestors with @prev, those will already have @prev's
+                * TSK_ONCPU bit set, and we can stop the iteration there.
                  */
-               identical_state = prev->psi_flags == next->psi_flags;
-               iter = NULL;
-               while ((group = iterate_groups(next, &iter))) {
-                       if (identical_state &&
-                           per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
+               group = task_psi_group(next);
+               do {
+                       if (per_cpu_ptr(group->pcpu, cpu)->state_mask &
+                           PSI_ONCPU) {
                                 common = group;
                                 break;
                         }
  
                         psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
-               }
+               } while ((group = group->parent));
         }
  
         if (prev->pid) {
                 int clear = TSK_ONCPU, set = 0;
+               bool wake_clock = true;
  
                 /*
                  * When we're going to sleep, psi_dequeue() lets us
@@ -867,26 +876,74 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
                                 clear |= TSK_MEMSTALL_RUNNING;
                         if (prev->in_iowait)
                                 set |= TSK_IOWAIT;
+
+                       /*
+                        * Periodic aggregation shuts off if there is a period of no
+                        * task changes, so we wake it back up if necessary. However,
+                        * don't do this if the task change is the aggregation worker
+                        * itself going to sleep, or we'll ping-pong forever.
+                        */
+                       if (unlikely((prev->flags & PF_WQ_WORKER) &&
+                                    wq_worker_last_func(prev) == psi_avgs_work))
+                               wake_clock = false;
                 }
  
                 psi_flags_change(prev, clear, set);
  
-               iter = NULL;
-               while ((group = iterate_groups(prev, &iter)) && group != common)
-                       psi_group_change(group, cpu, clear, set, now, true);
+               group = task_psi_group(prev);
+               do {
+                       if (group == common)
+                               break;
+                       psi_group_change(group, cpu, clear, set, now, wake_clock);
+               } while ((group = group->parent));
  
                 /*
-                * TSK_ONCPU is handled up to the common ancestor. If we're tasked
-                * with dequeuing too, finish that for the rest of the hierarchy.
+                * TSK_ONCPU is handled up to the common ancestor. If there are
+                * any other differences between the two tasks (e.g. prev goes
+                * to sleep, or only one task is memstall), finish propagating
+                * those differences all the way up to the root.
                  */
-               if (sleep) {
+               if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
                         clear &= ~TSK_ONCPU;
-                       for (; group; group = iterate_groups(prev, &iter))
-                               psi_group_change(group, cpu, clear, set, now, true);
+                       for (; group; group = group->parent)
+                               psi_group_change(group, cpu, clear, set, now, wake_clock);
                 }
         }
  }
  
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+void psi_account_irqtime(struct task_struct *task, u32 delta)
+{
+       int cpu = task_cpu(task);
+       struct psi_group *group;
+       struct psi_group_cpu *groupc;
+       u64 now;
+
+       if (!task->pid)
+               return;
+
+       now = cpu_clock(cpu);
+
+       group = task_psi_group(task);
+       do {
+               if (!group->enabled)
+                       continue;
+
+               groupc = per_cpu_ptr(group->pcpu, cpu);
+
+               write_seqcount_begin(&groupc->seq);
+
+               record_times(groupc, now);
+               groupc->times[PSI_IRQ_FULL] += delta;
+
+               write_seqcount_end(&groupc->seq);
+
+               if (group->poll_states & (1 << PSI_IRQ_FULL))
+                       psi_schedule_poll_work(group, 1);
+       } while ((group = group->parent));
+}
+#endif
+
  /**
   * psi_memstall_enter - mark the beginning of a memory stall section
   * @flags: flags to handle nested sections
@@ -952,7 +1009,7 @@ EXPORT_SYMBOL_GPL(psi_memstall_leave);
  #ifdef CONFIG_CGROUPS
  int psi_cgroup_alloc(struct cgroup *cgroup)
  {
-       if (static_branch_likely(&psi_disabled))
+       if (!static_branch_likely(&psi_cgroups_enabled))
                 return 0;
  
         cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL);
@@ -965,12 +1022,13 @@ int psi_cgroup_alloc(struct cgroup *cgroup)
                 return -ENOMEM;
         }
         group_init(cgroup->psi);
+       cgroup->psi->parent = cgroup_psi(cgroup_parent(cgroup));
         return 0;
  }
  
  void psi_cgroup_free(struct cgroup *cgroup)
  {
-       if (static_branch_likely(&psi_disabled))
+       if (!static_branch_likely(&psi_cgroups_enabled))
                 return;
  
         cancel_delayed_work_sync(&cgroup->psi->avgs_work);
@@ -998,7 +1056,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
         struct rq_flags rf;
         struct rq *rq;
  
-       if (static_branch_likely(&psi_disabled)) {
+       if (!static_branch_likely(&psi_cgroups_enabled)) {
                 /*
                  * Lame to do this here, but the scheduler cannot be locked
                  * from the outside, so we move cgroups from inside sched/.
@@ -1046,10 +1104,45 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
  
         task_rq_unlock(rq, task, &rf);
  }
+
+void psi_cgroup_restart(struct psi_group *group)
+{
+       int cpu;
+
+       /*
+        * After psi_group->enabled is cleared, we don't actually
+        * stop the percpu task accounting in each psi_group_cpu;
+        * we only stop the test_state() loop, record_times() and
+        * the averaging worker. See psi_group_change() for details.
+        *
+        * When disabling cgroup PSI, this function has nothing to sync,
+        * since the cgroup pressure files are hidden and each percpu
+        * psi_group_cpu sees !psi_group->enabled and only does task accounting.
+        *
+        * When re-enabling cgroup PSI, this function uses psi_group_change()
+        * to get the correct state mask from the test_state() loop on tasks[]
+        * and to restart groupc->state_start from now. It uses
+        * .clear = .set = 0 since no task status has really changed.
+        */
+       if (!group->enabled)
+               return;
+
+       for_each_possible_cpu(cpu) {
+               struct rq *rq = cpu_rq(cpu);
+               struct rq_flags rf;
+               u64 now;
+
+               rq_lock_irq(rq, &rf);
+               now = cpu_clock(cpu);
+               psi_group_change(group, cpu, 0, 0, now, true);
+               rq_unlock_irq(rq, &rf);
+       }
+}
  #endif /* CONFIG_CGROUPS */
  
  int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
  {
+       bool only_full = false;
         int full;
         u64 now;
  
@@ -1064,7 +1157,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
                 group->avg_next_update = update_averages(group, now);
         mutex_unlock(&group->avgs_lock);
  
-       for (full = 0; full < 2; full++) {
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+       only_full = res == PSI_IRQ;
+#endif
+
+       for (full = 0; full < 2 - only_full; full++) {
                 unsigned long avg[3] = { 0, };
                 u64 total = 0;
                 int w;
@@ -1078,7 +1175,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
                 }
  
                 seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
-                          full ? "full" : "some",
+                          full || only_full ? "full" : "some",
                            LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
                            LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
                            LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
@@ -1106,6 +1203,11 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
         else
                 return ERR_PTR(-EINVAL);
  
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+       if (res == PSI_IRQ && --state != PSI_IRQ_FULL)
+               return ERR_PTR(-EINVAL);
+#endif
+
         if (state >= PSI_NONIDLE)
                 return ERR_PTR(-EINVAL);
  
@@ -1390,6 +1492,33 @@ static const struct proc_ops psi_cpu_proc_ops = {
         .proc_release   = psi_fop_release,
  };
  
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static int psi_irq_show(struct seq_file *m, void *v)
+{
+       return psi_show(m, &psi_system, PSI_IRQ);
+}
+
+static int psi_irq_open(struct inode *inode, struct file *file)
+{
+       return psi_open(file, psi_irq_show);
+}
+
+static ssize_t psi_irq_write(struct file *file, const char __user *user_buf,
+                            size_t nbytes, loff_t *ppos)
+{
+       return psi_write(file, user_buf, nbytes, PSI_IRQ);
+}
+
+static const struct proc_ops psi_irq_proc_ops = {
+       .proc_open      = psi_irq_open,
+       .proc_read      = seq_read,
+       .proc_lseek     = seq_lseek,
+       .proc_write     = psi_irq_write,
+       .proc_poll      = psi_fop_poll,
+       .proc_release   = psi_fop_release,
+};
+#endif
+
  static int __init psi_proc_init(void)
  {
         if (psi_enable) {
@@ -1397,6 +1526,9 @@ static int __init psi_proc_init(void)
                 proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops);
                 proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops);
                 proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops);
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+               proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops);
+#endif
         }
         return 0;
  }
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h

index baa839c..84a1889 100644 (file)
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -107,6 +107,11 @@ __schedstats_from_se(struct sched_entity *se)
  }
  
  #ifdef CONFIG_PSI
+void psi_task_change(struct task_struct *task, int clear, int set);
+void psi_task_switch(struct task_struct *prev, struct task_struct *next,
+                    bool sleep);
+void psi_account_irqtime(struct task_struct *task, u32 delta);
+
  /*
   * PSI tracks state that persists across sleeps, such as iowaits and
   * memory stalls. As a result, it has to distinguish between sleeps,
@@ -201,6 +206,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
  static inline void psi_sched_switch(struct task_struct *prev,
                                     struct task_struct *next,
                                     bool sleep) {}
+static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
  #endif /* CONFIG_PSI */
  
  #ifdef CONFIG_SCHED_INFO
author	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 14 Oct 2022 20:03:00 +0000 (13:03 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 14 Oct 2022 20:03:00 +0000 (13:03 -0700)
Documentation/admin-guide/cgroup-v2.rst		patch \| blob \| history
include/linux/cgroup-defs.h		patch \| blob \| history
include/linux/cgroup.h		patch \| blob \| history
include/linux/psi.h		patch \| blob \| history
include/linux/psi_types.h		patch \| blob \| history
kernel/cgroup/cgroup.c		patch \| blob \| history
kernel/sched/core.c		patch \| blob \| history
kernel/sched/psi.c		patch \| blob \| history
kernel/sched/stats.h		patch \| blob \| history