// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"

#include <linux/sched/cputime.h>

#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>

static DEFINE_SPINLOCK(cgroup_rstat_lock);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);

static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
{
	return per_cpu_ptr(cgrp->rstat_cpu, cpu);
}

/**
 * cgroup_rstat_updated - keep track of updated rstat_cpu
 * @cgrp: target cgroup
 * @cpu: cpu on which rstat_cpu was updated
 *
 * @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching
 * rstat_cpu->updated_children list. See the comment on top of
 * cgroup_rstat_cpu definition for details.
 */
__bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
{
	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
	unsigned long flags;

	/*
	 * Speculative already-on-list test. This may race leading to
	 * temporary inaccuracies, which is fine.
	 *
	 * Because @parent's updated_children is terminated with @parent
	 * instead of NULL, we can tell whether @cgrp is on the list by
	 * testing the next pointer for NULL.
	 */
	if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
		return;

	raw_spin_lock_irqsave(cpu_lock, flags);

	/* put @cgrp and all ancestors on the corresponding updated lists */
	while (true) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
		struct cgroup *parent = cgroup_parent(cgrp);
		struct cgroup_rstat_cpu *prstatc;

		/*
		 * Both additions and removals are bottom-up. If a cgroup
		 * is already in the tree, all ancestors are.
		 */
		if (rstatc->updated_next)
			break;

		/* Root has no parent to link it to, but mark it busy */
		if (!parent) {
			rstatc->updated_next = cgrp;
			break;
		}

		prstatc = cgroup_rstat_cpu(parent, cpu);
		rstatc->updated_next = prstatc->updated_children;
		prstatc->updated_children = cgrp;

		cgrp = parent;
	}

	raw_spin_unlock_irqrestore(cpu_lock, flags);
}

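/*
 * Illustrative caller pattern (a sketch, not taken from this file): a
 * controller bumps a per-cgroup, per-cpu counter and then marks the cgroup
 * updated so the next rstat flush picks up the delta. The my_charge and
 * my_cgroup_pcpu_state names are hypothetical; this assumes the caller
 * already runs with preemption disabled.
 *
 *	static void my_charge(struct cgroup *cgrp, u64 bytes)
 *	{
 *		// hypothetical per-cgroup per-cpu state owned by the controller
 *		my_cgroup_pcpu_state(cgrp)->bytes += bytes;
 *		cgroup_rstat_updated(cgrp, smp_processor_id());
 *	}
 */
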
/**
 * cgroup_rstat_push_children - push children cgroups into the given list
 * @head: current head of the list (= subtree root)
 * @child: first child of the root
 * @cpu: target cpu
 *
 * Return: A new singly linked list of cgroups to be flushed
 *
 * Iteratively traverse down the cgroup_rstat_cpu updated tree level by
 * level and push all the parents first before their next level children
 * into a singly linked list built from the tail backward like "pushing"
 * cgroups into a stack. The root is pushed by the caller.
 */
static struct cgroup *cgroup_rstat_push_children(struct cgroup *head,
						 struct cgroup *child, int cpu)
{
	struct cgroup *chead = child;	/* Head of child cgroup level */
	struct cgroup *ghead = NULL;	/* Head of grandchild cgroup level */
	struct cgroup *parent, *grandchild;
	struct cgroup_rstat_cpu *crstatc;

	child->rstat_flush_next = NULL;

next_level:
	while (chead) {
		child = chead;
		chead = child->rstat_flush_next;
		parent = cgroup_parent(child);

		/* updated_next is parent cgroup terminated */
		while (child != parent) {
			child->rstat_flush_next = head;
			head = child;
			crstatc = cgroup_rstat_cpu(child, cpu);
			grandchild = crstatc->updated_children;
			if (grandchild != child) {
				/* Push the grandchild to the next level */
				crstatc->updated_children = child;
				grandchild->rstat_flush_next = ghead;
				ghead = grandchild;
			}
			child = crstatc->updated_next;
			crstatc->updated_next = NULL;
		}
	}

	if (ghead) {
		chead = ghead;
		ghead = NULL;
		goto next_level;
	}
	return head;
}

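/*
 * Worked example (illustrative, assuming a hierarchy R -> {A, B} with
 * A -> C, and all four cgroups on the updated tree for @cpu): the caller
 * pushes R first, cgroup_rstat_push_children() then pushes the first level
 * (A, B) and finally the next level (C), so one possible resulting list is:
 *
 *	C -> B -> A -> R
 *
 * The exact sibling order depends on the order of updates, but a child is
 * always pushed after (and therefore flushed before) its parent.
 */
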
/**
 * cgroup_rstat_updated_list - return a list of updated cgroups to be flushed
 * @root: root of the cgroup subtree to traverse
 * @cpu: target cpu
 *
 * Return: A singly linked list of cgroups to be flushed
 *
 * Walks the updated rstat_cpu tree on @cpu from @root. During traversal,
 * each returned cgroup is unlinked from the updated tree.
 *
 * The only ordering guarantee is that, for a parent and a child pair
 * covered by a given traversal, the child is before its parent in
 * the list.
 *
 * Note that updated_children is self-terminated and points to a list of
 * child cgroups if not empty, whereas updated_next is a sibling link within
 * the children list and is terminated by the parent cgroup. An exception is
 * the cgroup root, whose updated_next can be self-terminated.
 */
static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
{
	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(root, cpu);
	struct cgroup *head = NULL, *parent, *child;
	unsigned long flags;

	/*
	 * The _irqsave() is needed because cgroup_rstat_lock is
	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
	 * this lock with the _irq() suffix only disables interrupts on
	 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
	 * interrupts on both configurations. The _irqsave() ensures
	 * that interrupts are always disabled and later restored.
	 */
	raw_spin_lock_irqsave(cpu_lock, flags);

	/* Return NULL if this subtree is not on-list */
	if (!rstatc->updated_next)
		goto unlock_ret;

	/*
	 * Unlink @root from its parent. As the updated_children list is
	 * singly linked, we have to walk it to find the removal point.
	 */
	parent = cgroup_parent(root);
	if (parent) {
		struct cgroup_rstat_cpu *prstatc;
		struct cgroup **nextp;

		prstatc = cgroup_rstat_cpu(parent, cpu);
		nextp = &prstatc->updated_children;
		while (*nextp != root) {
			struct cgroup_rstat_cpu *nrstatc;

			nrstatc = cgroup_rstat_cpu(*nextp, cpu);
			WARN_ON_ONCE(*nextp == parent);
			nextp = &nrstatc->updated_next;
		}
		*nextp = rstatc->updated_next;
	}

	rstatc->updated_next = NULL;

	/* Push @root to the list first before pushing the children */
	head = root;
	root->rstat_flush_next = NULL;
	child = rstatc->updated_children;
	rstatc->updated_children = root;
	if (child != root)
		head = cgroup_rstat_push_children(head, child, cpu);
unlock_ret:
	raw_spin_unlock_irqrestore(cpu_lock, flags);
	return head;
}

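/*
 * Illustrative per-cpu linkage (a sketch of the convention described above,
 * assuming parent P has updated children C1 and C2 and neither child has
 * updated children of its own):
 *
 *	cgroup_rstat_cpu(P, cpu)->updated_children  = C1
 *	cgroup_rstat_cpu(C1, cpu)->updated_next     = C2	// sibling link
 *	cgroup_rstat_cpu(C2, cpu)->updated_next     = P		// parent-terminated
 *	cgroup_rstat_cpu(C1, cpu)->updated_children = C1	// self-terminated (empty)
 *	cgroup_rstat_cpu(C2, cpu)->updated_children = C2	// self-terminated (empty)
 *
 * A NULL updated_next therefore means "not on the updated tree", which is
 * exactly what the speculative test in cgroup_rstat_updated() relies on.
 */
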
/*
 * A hook for bpf stat collectors to attach to and flush their stats.
 * Together with providing bpf kfuncs for cgroup_rstat_updated() and
 * cgroup_rstat_flush(), this enables a complete workflow where bpf progs that
 * collect cgroup stats can integrate with rstat for efficient flushing.
 *
 * A static noinline declaration here could cause the compiler to optimize away
 * the function. A global noinline declaration will keep the definition, but may
 * optimize away the callsite. Therefore, __weak is needed to ensure that the
 * call is still emitted, by telling the compiler that we don't know what the
 * function might eventually be.
 */
__bpf_hook_start();

__weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
				     struct cgroup *parent, int cpu)
{
}

__bpf_hook_end();

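/*
 * A bpf stat collector typically attaches to this hook with an fentry
 * program and folds its per-cpu numbers into per-cgroup totals as each
 * cgroup is flushed. A minimal sketch (the my_flush name and the collector
 * logic are assumptions, not part of this file):
 *
 *	SEC("fentry/bpf_rstat_flush")
 *	int BPF_PROG(my_flush, struct cgroup *cgrp, struct cgroup *parent,
 *		     int cpu)
 *	{
 *		// fold this cpu's pending delta for cgrp into its total and
 *		// propagate the delta to parent, mirroring what
 *		// cgroup_base_stat_flush() does for the base stats
 *		return 0;
 *	}
 */
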
/* see cgroup_rstat_flush() */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
	__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
{
	int cpu;

	lockdep_assert_held(&cgroup_rstat_lock);

	for_each_possible_cpu(cpu) {
		struct cgroup *pos = cgroup_rstat_updated_list(cgrp, cpu);

		for (; pos; pos = pos->rstat_flush_next) {
			struct cgroup_subsys_state *css;

			cgroup_base_stat_flush(pos, cpu);
			bpf_rstat_flush(pos, cgroup_parent(pos), cpu);

			rcu_read_lock();
			list_for_each_entry_rcu(css, &pos->rstat_css_list,
						rstat_css_node)
				css->ss->css_rstat_flush(css, cpu);
			rcu_read_unlock();
		}

		/* play nice and yield if necessary */
		if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
			spin_unlock_irq(&cgroup_rstat_lock);
			if (!cond_resched())
				cpu_relax();
			spin_lock_irq(&cgroup_rstat_lock);
		}
	}
}

/**
 * cgroup_rstat_flush - flush stats in @cgrp's subtree
 * @cgrp: target cgroup
 *
 * Collect all per-cpu stats in @cgrp's subtree into the global counters
 * and propagate them upwards. After this function returns, all cgroups in
 * the subtree have up-to-date ->stat.
 *
 * This also gets all cgroups in the subtree including @cgrp off the
 * ->updated_children lists.
 *
 * This function may block.
 */
__bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
{
	might_sleep();

	spin_lock_irq(&cgroup_rstat_lock);
	cgroup_rstat_flush_locked(cgrp);
	spin_unlock_irq(&cgroup_rstat_lock);
}

/**
 * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
 * @cgrp: target cgroup
 *
 * Flush stats in @cgrp's subtree and prevent further flushes. Must be
 * paired with cgroup_rstat_flush_release().
 *
 * This function may block.
 */
void cgroup_rstat_flush_hold(struct cgroup *cgrp)
	__acquires(&cgroup_rstat_lock)
{
	might_sleep();
	spin_lock_irq(&cgroup_rstat_lock);
	cgroup_rstat_flush_locked(cgrp);
}

/**
 * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
 */
void cgroup_rstat_flush_release(void)
	__releases(&cgroup_rstat_lock)
{
	spin_unlock_irq(&cgroup_rstat_lock);
}

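/*
 * Typical reader pattern (a sketch; cgroup_base_stat_cputime_show() below
 * is the in-tree example): flush and keep the lock held so the snapshot
 * stays consistent while it is being read, then release.
 *
 *	cgroup_rstat_flush_hold(cgrp);
 *	// read cgrp->bstat (or subsystem state) while further flushes are blocked
 *	cgroup_rstat_flush_release();
 */
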
int cgroup_rstat_init(struct cgroup *cgrp)
{
	int cpu;

	/* the root cgrp has rstat_cpu preallocated */
	if (!cgrp->rstat_cpu) {
		cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
		if (!cgrp->rstat_cpu)
			return -ENOMEM;
	}

	/* ->updated_children list is self terminated */
	for_each_possible_cpu(cpu) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

		rstatc->updated_children = cgrp;
		u64_stats_init(&rstatc->bsync);
	}

	return 0;
}

void cgroup_rstat_exit(struct cgroup *cgrp)
{
	int cpu;

	cgroup_rstat_flush(cgrp);

	/* sanity check */
	for_each_possible_cpu(cpu) {
		struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

		if (WARN_ON_ONCE(rstatc->updated_children != cgrp) ||
		    WARN_ON_ONCE(rstatc->updated_next))
			return;
	}

	free_percpu(cgrp->rstat_cpu);
	cgrp->rstat_cpu = NULL;
}

void __init cgroup_rstat_boot(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
}

/*
 * Functions for cgroup basic resource statistics implemented on top of
 * rstat.
 */
static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
				 struct cgroup_base_stat *src_bstat)
{
	dst_bstat->cputime.utime += src_bstat->cputime.utime;
	dst_bstat->cputime.stime += src_bstat->cputime.stime;
	dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
	dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
#endif
}

static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
				 struct cgroup_base_stat *src_bstat)
{
	dst_bstat->cputime.utime -= src_bstat->cputime.utime;
	dst_bstat->cputime.stime -= src_bstat->cputime.stime;
	dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
	dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
#endif
}

static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
	struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
	struct cgroup *parent = cgroup_parent(cgrp);
	struct cgroup_rstat_cpu *prstatc;
	struct cgroup_base_stat delta;
	unsigned seq;

	/* Root-level stats are sourced from system-wide CPU stats */
	if (!parent)
		return;

	/* fetch the current per-cpu values */
	do {
		seq = __u64_stats_fetch_begin(&rstatc->bsync);
		delta = rstatc->bstat;
	} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));

	/* propagate per-cpu delta to cgroup and per-cpu global statistics */
	cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
	cgroup_base_stat_add(&cgrp->bstat, &delta);
	cgroup_base_stat_add(&rstatc->last_bstat, &delta);
	cgroup_base_stat_add(&rstatc->subtree_bstat, &delta);

	/* propagate cgroup and per-cpu global delta to parent (unless that's root) */
	if (cgroup_parent(parent)) {
		delta = cgrp->bstat;
		cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
		cgroup_base_stat_add(&parent->bstat, &delta);
		cgroup_base_stat_add(&cgrp->last_bstat, &delta);

		delta = rstatc->subtree_bstat;
		prstatc = cgroup_rstat_cpu(parent, cpu);
		cgroup_base_stat_sub(&delta, &rstatc->last_subtree_bstat);
		cgroup_base_stat_add(&prstatc->subtree_bstat, &delta);
		cgroup_base_stat_add(&rstatc->last_subtree_bstat, &delta);
	}
}

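/*
 * Worked example (illustrative numbers): if this cpu's rstatc->bstat utime
 * grew from 100000ns to 150000ns since the last flush, delta.cputime.utime
 * is 50000ns after the cgroup_base_stat_sub() above. That delta is added to
 * cgrp->bstat, remembered in rstatc->last_bstat so it is not counted twice,
 * and accumulated into rstatc->subtree_bstat. The second block then forwards
 * the cgroup-level delta (cgrp->bstat - cgrp->last_bstat) and the per-cpu
 * subtree delta up to the parent, which is flushed after its children.
 */
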
static struct cgroup_rstat_cpu *
cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
{
	struct cgroup_rstat_cpu *rstatc;

	rstatc = get_cpu_ptr(cgrp->rstat_cpu);
	*flags = u64_stats_update_begin_irqsave(&rstatc->bsync);

	return rstatc;
}

static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
						 struct cgroup_rstat_cpu *rstatc,
						 unsigned long flags)
{
	u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
	cgroup_rstat_updated(cgrp, smp_processor_id());
	put_cpu_ptr(rstatc);
}

void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
	struct cgroup_rstat_cpu *rstatc;
	unsigned long flags;

	rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
	rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
	cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}

void __cgroup_account_cputime_field(struct cgroup *cgrp,
				    enum cpu_usage_stat index, u64 delta_exec)
{
	struct cgroup_rstat_cpu *rstatc;
	unsigned long flags;

	rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);

	switch (index) {
	case CPUTIME_USER:
	case CPUTIME_NICE:
		rstatc->bstat.cputime.utime += delta_exec;
		break;
	case CPUTIME_SYSTEM:
	case CPUTIME_IRQ:
	case CPUTIME_SOFTIRQ:
		rstatc->bstat.cputime.stime += delta_exec;
		break;
#ifdef CONFIG_SCHED_CORE
	case CPUTIME_FORCEIDLE:
		rstatc->bstat.forceidle_sum += delta_exec;
		break;
#endif
	default:
		break;
	}

	cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}

/*
 * Compute the cputime for the root cgroup by getting the per cpu data
 * at a global level, then categorizing the fields in a manner consistent
 * with how it is done by __cgroup_account_cputime_field for each bit of
 * cpu time attributed to a cgroup.
 */
static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
{
	struct task_cputime *cputime = &bstat->cputime;
	int i;

	memset(bstat, 0, sizeof(*bstat));
	for_each_possible_cpu(i) {
		struct kernel_cpustat kcpustat;
		u64 *cpustat = kcpustat.cpustat;
		u64 user = 0;
		u64 sys = 0;

		kcpustat_cpu_fetch(&kcpustat, i);

		user += cpustat[CPUTIME_USER];
		user += cpustat[CPUTIME_NICE];
		cputime->utime += user;

		sys += cpustat[CPUTIME_SYSTEM];
		sys += cpustat[CPUTIME_IRQ];
		sys += cpustat[CPUTIME_SOFTIRQ];
		cputime->stime += sys;

		cputime->sum_exec_runtime += user;
		cputime->sum_exec_runtime += sys;
		cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];

#ifdef CONFIG_SCHED_CORE
		bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
#endif
	}
}

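/*
 * Worked example (illustrative per-cpu values): with USER=3ms, NICE=1ms,
 * SYSTEM=2ms, IRQ=0.5ms, SOFTIRQ=0.5ms and STEAL=1ms for one cpu, that cpu
 * contributes utime=4ms, stime=3ms and sum_exec_runtime=8ms, matching the
 * buckets used by __cgroup_account_cputime_field() above.
 */
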
void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;
	u64 usage, utime, stime;
	struct cgroup_base_stat bstat;
#ifdef CONFIG_SCHED_CORE
	u64 forceidle_time;
#endif

	if (cgroup_parent(cgrp)) {
		cgroup_rstat_flush_hold(cgrp);
		usage = cgrp->bstat.cputime.sum_exec_runtime;
		cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
			       &utime, &stime);
#ifdef CONFIG_SCHED_CORE
		forceidle_time = cgrp->bstat.forceidle_sum;
#endif
		cgroup_rstat_flush_release();
	} else {
		root_cgroup_cputime(&bstat);
		usage = bstat.cputime.sum_exec_runtime;
		utime = bstat.cputime.utime;
		stime = bstat.cputime.stime;
#ifdef CONFIG_SCHED_CORE
		forceidle_time = bstat.forceidle_sum;
#endif
	}

	do_div(usage, NSEC_PER_USEC);
	do_div(utime, NSEC_PER_USEC);
	do_div(stime, NSEC_PER_USEC);
#ifdef CONFIG_SCHED_CORE
	do_div(forceidle_time, NSEC_PER_USEC);
#endif

	seq_printf(seq, "usage_usec %llu\n"
		   "user_usec %llu\n"
		   "system_usec %llu\n",
		   usage, utime, stime);

#ifdef CONFIG_SCHED_CORE
	seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
#endif
}

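/*
 * The above emits the base keys of cpu.stat, e.g. (illustrative values
 * consistent with the worked example above):
 *
 *	usage_usec 8000
 *	user_usec 4000
 *	system_usec 3000
 *	core_sched.force_idle_usec 0	(only with CONFIG_SCHED_CORE)
 */
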
/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
BTF_KFUNCS_START(bpf_rstat_kfunc_ids)
BTF_ID_FLAGS(func, cgroup_rstat_updated)
BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE)
BTF_KFUNCS_END(bpf_rstat_kfunc_ids)

static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
	.owner		= THIS_MODULE,
	.set		= &bpf_rstat_kfunc_ids,
};

static int __init bpf_rstat_kfunc_init(void)
{
	return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
					 &bpf_rstat_kfunc_set);
}
late_initcall(bpf_rstat_kfunc_init);
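
/*
 * From the bpf side, the kfuncs registered above are declared and called
 * like any other kfunc. A minimal sketch (declarations assumed, typically
 * coming from vmlinux.h plus __ksym in the bpf program):
 *
 *	extern void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) __ksym;
 *	extern void cgroup_rstat_flush(struct cgroup *cgrp) __ksym;
 *
 *	// in a tracing prog: mark a cgroup dirty after updating its map entry
 *	cgroup_rstat_updated(cgrp, bpf_get_smp_processor_id());
 *
 *	// in a sleepable prog (hence KF_SLEEPABLE): flush before reading
 *	cgroup_rstat_flush(cgrp);
 */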