linux-2.6-microblaze.git: kernel/cgroup/rstat.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 #include "cgroup-internal.h"
3
4 #include <linux/sched/cputime.h>
5
6 static DEFINE_SPINLOCK(cgroup_rstat_lock);
7 static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
8
9 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
10
11 static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
12 {
13         return per_cpu_ptr(cgrp->rstat_cpu, cpu);
14 }
15
16 /**
17  * cgroup_rstat_updated - keep track of updated rstat_cpu
18  * @cgrp: target cgroup
19  * @cpu: cpu on which rstat_cpu was updated
20  *
21  * @cgrp's rstat_cpu on @cpu was updated.  Put it on the parent's matching
22  * rstat_cpu->updated_children list.  See the comment on top of
23  * cgroup_rstat_cpu definition for details.
24  */
25 void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
26 {
27         raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
28         struct cgroup *parent;
29         unsigned long flags;
30
31         /* nothing to do for root */
32         if (!cgroup_parent(cgrp))
33                 return;
34
35         /*
36          * Speculative already-on-list test. This may race leading to
37          * temporary inaccuracies, which is fine.
38          *
39          * Because @parent's updated_children is terminated with @parent
40          * instead of NULL, we can tell whether @cgrp is on the list by
41          * testing the next pointer for NULL.
42          */
43         if (cgroup_rstat_cpu(cgrp, cpu)->updated_next)
44                 return;
45
46         raw_spin_lock_irqsave(cpu_lock, flags);
47
48         /* put @cgrp and all ancestors on the corresponding updated lists */
49         for (parent = cgroup_parent(cgrp); parent;
50              cgrp = parent, parent = cgroup_parent(cgrp)) {
51                 struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
52                 struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
53
54                 /*
55                  * Both additions and removals are bottom-up.  If a cgroup
56                  * is already in the tree, all ancestors are.
57                  */
58                 if (rstatc->updated_next)
59                         break;
60
61                 rstatc->updated_next = prstatc->updated_children;
62                 prstatc->updated_children = cgrp;
63         }
64
65         raw_spin_unlock_irqrestore(cpu_lock, flags);
66 }
67
68 /**
69  * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
70  * @pos: current position
71  * @root: root of the tree to traverse
72  * @cpu: target cpu
73  *
74  * Walks the updated rstat_cpu tree on @cpu from @root.  %NULL @pos starts
75  * the traversal and %NULL return indicates the end.  During traversal,
76  * each returned cgroup is unlinked from the tree.  Must be called with the
77  * matching cgroup_rstat_cpu_lock held.
78  *
79  * The only ordering guarantee is that, for a parent and a child pair
80  * covered by a given traversal, if a child is visited, its parent is
81  * guaranteed to be visited afterwards.
82  */
83 static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
84                                                    struct cgroup *root, int cpu)
85 {
86         struct cgroup_rstat_cpu *rstatc;
87
88         if (pos == root)
89                 return NULL;
90
91         /*
92          * We're going to walk down to the first leaf and visit/remove it.
93          * Any unvisited node can serve as the starting point.
94          */
95         if (!pos)
96                 pos = root;
97         else
98                 pos = cgroup_parent(pos);
99
100         /* walk down to the first leaf */
101         while (true) {
102                 rstatc = cgroup_rstat_cpu(pos, cpu);
103                 if (rstatc->updated_children == pos)
104                         break;
105                 pos = rstatc->updated_children;
106         }
107
108         /*
109          * Unlink @pos from the tree.  As the updated_children list is
110          * singly linked, we have to walk it to find the removal point.
111          * However, due to the way we traverse, @pos will be the first
112          * child in most cases. The only exception is @root.
113          */
114         if (rstatc->updated_next) {
115                 struct cgroup *parent = cgroup_parent(pos);
116                 struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
117                 struct cgroup_rstat_cpu *nrstatc;
118                 struct cgroup **nextp;
119
120                 nextp = &prstatc->updated_children;
121                 while (true) {
122                         nrstatc = cgroup_rstat_cpu(*nextp, cpu);
123                         if (*nextp == pos)
124                                 break;
125
126                         WARN_ON_ONCE(*nextp == parent);
127                         nextp = &nrstatc->updated_next;
128                 }
129
130                 *nextp = rstatc->updated_next;
131                 rstatc->updated_next = NULL;
132
133                 return pos;
134         }
135
136         /* only happens for @root */
137         return NULL;
138 }
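
/*
 * Example of the per-cpu updated tree that the two functions above build
 * and dismantle.  For a hypothetical parent P whose children A and B were
 * both updated on this CPU (B last), the per-cpu fields look like:
 *
 *	P's updated_children = B
 *	B's updated_next     = A
 *	A's updated_next     = P	(terminated by the parent, not NULL)
 *
 * A cgroup with no updated children has updated_children pointing back at
 * itself, which is the leaf test used by the walk above.
 */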
139
140 /* see cgroup_rstat_flush() */
141 static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
142         __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
143 {
144         int cpu;
145
146         lockdep_assert_held(&cgroup_rstat_lock);
147
148         for_each_possible_cpu(cpu) {
149                 raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
150                                                        cpu);
151                 struct cgroup *pos = NULL;
152
153                 raw_spin_lock(cpu_lock);
154                 while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
155                         struct cgroup_subsys_state *css;
156
157                         cgroup_base_stat_flush(pos, cpu);
158
159                         rcu_read_lock();
160                         list_for_each_entry_rcu(css, &pos->rstat_css_list,
161                                                 rstat_css_node)
162                                 css->ss->css_rstat_flush(css, cpu);
163                         rcu_read_unlock();
164                 }
165                 raw_spin_unlock(cpu_lock);
166
167                 /* if @may_sleep, play nice and yield if necessary */
168                 if (may_sleep && (need_resched() ||
169                                   spin_needbreak(&cgroup_rstat_lock))) {
170                         spin_unlock_irq(&cgroup_rstat_lock);
171                         if (!cond_resched())
172                                 cpu_relax();
173                         spin_lock_irq(&cgroup_rstat_lock);
174                 }
175         }
176 }
177
178 /**
179  * cgroup_rstat_flush - flush stats in @cgrp's subtree
180  * @cgrp: target cgroup
181  *
182  * Collect all per-cpu stats in @cgrp's subtree into the global counters
183  * and propagate them upwards.  After this function returns, all cgroups in
184  * the subtree have up-to-date ->stat.
185  *
186  * This also gets all cgroups in the subtree including @cgrp off the
187  * ->updated_children lists.
188  *
189  * This function may block.
190  */
191 void cgroup_rstat_flush(struct cgroup *cgrp)
192 {
193         might_sleep();
194
195         spin_lock_irq(&cgroup_rstat_lock);
196         cgroup_rstat_flush_locked(cgrp, true);
197         spin_unlock_irq(&cgroup_rstat_lock);
198 }
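
/*
 * Illustrative usage, not part of this file: a controller's read path
 * would typically flush before reporting aggregated counters so that all
 * per-cpu deltas are folded in first.  The foo_* names are hypothetical.
 *
 *	static int foo_stat_show(struct seq_file *seq, void *v)
 *	{
 *		struct cgroup *cgrp = seq_css(seq)->cgroup;
 *
 *		cgroup_rstat_flush(cgrp);
 *		seq_printf(seq, "events %llu\n", foo_read_events(cgrp));
 *		return 0;
 *	}
 */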
199
200 /**
201  * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush()
202  * @cgrp: target cgroup
203  *
204  * This function can be called from any context.
205  */
206 void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
207 {
208         unsigned long flags;
209
210         spin_lock_irqsave(&cgroup_rstat_lock, flags);
211         cgroup_rstat_flush_locked(cgrp, false);
212         spin_unlock_irqrestore(&cgroup_rstat_lock, flags);
213 }
214
215 /**
216  * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
217  * @cgrp: target cgroup
218  *
219  * Flush stats in @cgrp's subtree and prevent further flushes.  Must be
220  * paired with cgroup_rstat_flush_release().
221  *
222  * This function may block.
223  */
224 void cgroup_rstat_flush_hold(struct cgroup *cgrp)
225         __acquires(&cgroup_rstat_lock)
226 {
227         might_sleep();
228         spin_lock_irq(&cgroup_rstat_lock);
229         cgroup_rstat_flush_locked(cgrp, true);
230 }
231
232 /**
233  * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
234  */
235 void cgroup_rstat_flush_release(void)
236         __releases(&cgroup_rstat_lock)
237 {
238         spin_unlock_irq(&cgroup_rstat_lock);
239 }
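
/*
 * Typical hold/release pairing for a reader that wants the counters to
 * stay flushed while it takes a snapshot (sketch; see
 * cgroup_base_stat_cputime_show() below for an in-tree example):
 *
 *	cgroup_rstat_flush_hold(cgrp);
 *	snapshot = cgrp->bstat;
 *	cgroup_rstat_flush_release();
 */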
240
241 int cgroup_rstat_init(struct cgroup *cgrp)
242 {
243         int cpu;
244
245         /* the root cgrp has rstat_cpu preallocated */
246         if (!cgrp->rstat_cpu) {
247                 cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
248                 if (!cgrp->rstat_cpu)
249                         return -ENOMEM;
250         }
251
252         /* ->updated_children list is self-terminated */
253         for_each_possible_cpu(cpu) {
254                 struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
255
256                 rstatc->updated_children = cgrp;
257                 u64_stats_init(&rstatc->bsync);
258         }
259
260         return 0;
261 }
262
263 void cgroup_rstat_exit(struct cgroup *cgrp)
264 {
265         int cpu;
266
267         cgroup_rstat_flush(cgrp);
268
269         /* sanity check */
270         for_each_possible_cpu(cpu) {
271                 struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
272
273                 if (WARN_ON_ONCE(rstatc->updated_children != cgrp) ||
274                     WARN_ON_ONCE(rstatc->updated_next))
275                         return;
276         }
277
278         free_percpu(cgrp->rstat_cpu);
279         cgrp->rstat_cpu = NULL;
280 }
281
282 void __init cgroup_rstat_boot(void)
283 {
284         int cpu;
285
286         for_each_possible_cpu(cpu)
287                 raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
288
289         BUG_ON(cgroup_rstat_init(&cgrp_dfl_root.cgrp));
290 }
291
292 /*
293  * Functions for cgroup basic resource statistics implemented on top of
294  * rstat.
295  */
296 static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
297                                  struct cgroup_base_stat *src_bstat)
298 {
299         dst_bstat->cputime.utime += src_bstat->cputime.utime;
300         dst_bstat->cputime.stime += src_bstat->cputime.stime;
301         dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
302 }
303
304 static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
305                                  struct cgroup_base_stat *src_bstat)
306 {
307         dst_bstat->cputime.utime -= src_bstat->cputime.utime;
308         dst_bstat->cputime.stime -= src_bstat->cputime.stime;
309         dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
310 }
311
312 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
313 {
314         struct cgroup *parent = cgroup_parent(cgrp);
315         struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
316         struct cgroup_base_stat cur, delta;
317         unsigned seq;
318
319         /* fetch the current per-cpu values */
320         do {
321                 seq = __u64_stats_fetch_begin(&rstatc->bsync);
322                 cur.cputime = rstatc->bstat.cputime;
323         } while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
324
325         /* propagate percpu delta to global */
326         delta = cur;
327         cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
328         cgroup_base_stat_add(&cgrp->bstat, &delta);
329         cgroup_base_stat_add(&rstatc->last_bstat, &delta);
330
331         /* propagate global delta to parent */
332         if (parent) {
333                 delta = cgrp->bstat;
334                 cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
335                 cgroup_base_stat_add(&parent->bstat, &delta);
336                 cgroup_base_stat_add(&cgrp->last_bstat, &delta);
337         }
338 }
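
/*
 * Worked example of the delta propagation above, with hypothetical
 * numbers: if this CPU's rstatc->bstat.cputime.utime reads 10ms while
 * rstatc->last_bstat records 7ms as already propagated, the 3ms delta is
 * added to both cgrp->bstat and rstatc->last_bstat.  The same
 * current-minus-last scheme is then repeated at the cgroup level to push
 * the accumulated delta up to the parent.
 */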
339
340 static struct cgroup_rstat_cpu *
341 cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp)
342 {
343         struct cgroup_rstat_cpu *rstatc;
344
345         rstatc = get_cpu_ptr(cgrp->rstat_cpu);
346         u64_stats_update_begin(&rstatc->bsync);
347         return rstatc;
348 }
349
350 static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
351                                                  struct cgroup_rstat_cpu *rstatc)
352 {
353         u64_stats_update_end(&rstatc->bsync);
354         cgroup_rstat_updated(cgrp, smp_processor_id());
355         put_cpu_ptr(rstatc);
356 }
357
358 void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
359 {
360         struct cgroup_rstat_cpu *rstatc;
361
362         rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
363         rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
364         cgroup_base_stat_cputime_account_end(cgrp, rstatc);
365 }
366
367 void __cgroup_account_cputime_field(struct cgroup *cgrp,
368                                     enum cpu_usage_stat index, u64 delta_exec)
369 {
370         struct cgroup_rstat_cpu *rstatc;
371
372         rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
373
374         switch (index) {
375         case CPUTIME_USER:
376         case CPUTIME_NICE:
377                 rstatc->bstat.cputime.utime += delta_exec;
378                 break;
379         case CPUTIME_SYSTEM:
380         case CPUTIME_IRQ:
381         case CPUTIME_SOFTIRQ:
382                 rstatc->bstat.cputime.stime += delta_exec;
383                 break;
384         default:
385                 break;
386         }
387
388         cgroup_base_stat_cputime_account_end(cgrp, rstatc);
389 }
390
391 /*
392  * Compute the cputime for the root cgroup by fetching the per-cpu data
393  * at the global level, then categorize the fields in a manner consistent
394  * with how __cgroup_account_cputime_field() attributes each bit of cpu
395  * time to a cgroup.
396  */
397 static void root_cgroup_cputime(struct task_cputime *cputime)
398 {
399         int i;
400
401         cputime->stime = 0;
402         cputime->utime = 0;
403         cputime->sum_exec_runtime = 0;
404         for_each_possible_cpu(i) {
405                 struct kernel_cpustat kcpustat;
406                 u64 *cpustat = kcpustat.cpustat;
407                 u64 user = 0;
408                 u64 sys = 0;
409
410                 kcpustat_cpu_fetch(&kcpustat, i);
411
412                 user += cpustat[CPUTIME_USER];
413                 user += cpustat[CPUTIME_NICE];
414                 cputime->utime += user;
415
416                 sys += cpustat[CPUTIME_SYSTEM];
417                 sys += cpustat[CPUTIME_IRQ];
418                 sys += cpustat[CPUTIME_SOFTIRQ];
419                 cputime->stime += sys;
420
421                 cputime->sum_exec_runtime += user;
422                 cputime->sum_exec_runtime += sys;
423                 cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
424                 cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST];
425                 cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST_NICE];
426         }
427 }
428
429 void cgroup_base_stat_cputime_show(struct seq_file *seq)
430 {
431         struct cgroup *cgrp = seq_css(seq)->cgroup;
432         u64 usage, utime, stime;
433         struct task_cputime cputime;
434
435         if (cgroup_parent(cgrp)) {
436                 cgroup_rstat_flush_hold(cgrp);
437                 usage = cgrp->bstat.cputime.sum_exec_runtime;
438                 cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
439                                &utime, &stime);
440                 cgroup_rstat_flush_release();
441         } else {
442                 root_cgroup_cputime(&cputime);
443                 usage = cputime.sum_exec_runtime;
444                 utime = cputime.utime;
445                 stime = cputime.stime;
446         }
447
448         do_div(usage, NSEC_PER_USEC);
449         do_div(utime, NSEC_PER_USEC);
450         do_div(stime, NSEC_PER_USEC);
451
452         seq_printf(seq, "usage_usec %llu\n"
453                    "user_usec %llu\n"
454                    "system_usec %llu\n",
455                    usage, utime, stime);
456 }