kernel/cgroup/rstat.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 #include "cgroup-internal.h"
   3
   4 #include <linux/sched/cputime.h>
   5
   /*
    * cgroup_rstat_lock is held around every cgroup_rstat_flush_locked() call
    * in this file and stays held between cgroup_rstat_flush_hold() and
    * cgroup_rstat_flush_release().
    *
    * cgroup_rstat_cpu_lock is a per-cpu raw spinlock.  It is taken while the
    * per-cpu updated_children/updated_next lists are modified by
    * cgroup_rstat_updated() and while they are dismantled by the flush path.
    */
   6 static DEFINE_SPINLOCK(cgroup_rstat_lock);
   7 static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
   8
   /* defined further down; needed by cgroup_rstat_flush_locked() */
   9 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
  10
  11 static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
  12 {
  13         return per_cpu_ptr(cgrp->rstat_cpu, cpu);
  14 }
  15
  16 /**
  17  * cgroup_rstat_updated - keep track of updated rstat_cpu
  18  * @cgrp: target cgroup
  19  * @cpu: cpu on which rstat_cpu was updated
  20  *
  21  * @cgrp's rstat_cpu on @cpu was updated.  Put it on the parent's matching
  22  * rstat_cpu->updated_children list.  See the comment on top of
  23  * cgroup_rstat_cpu definition for details.
  24  */
  25 void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
  26 {
  27         raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
  28         unsigned long flags;
  29
  30         /*
  31          * Speculative already-on-list test. This may race leading to
  32          * temporary inaccuracies, which is fine.
  33          *
  34          * Because @parent's updated_children is terminated with @parent
  35          * instead of NULL, we can tell whether @cgrp is on the list by
  36          * testing the next pointer for NULL.
  37          */
  38         if (cgroup_rstat_cpu(cgrp, cpu)->updated_next)
  39                 return;
  40
  41         raw_spin_lock_irqsave(cpu_lock, flags);
  42
  43         /* put @cgrp and all ancestors on the corresponding updated lists */
  44         while (true) {
  45                 struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
  46                 struct cgroup *parent = cgroup_parent(cgrp);
  47                 struct cgroup_rstat_cpu *prstatc;
  48
  49                 /*
  50                  * Both additions and removals are bottom-up.  If a cgroup
  51                  * is already in the tree, all ancestors are.
  52                  */
  53                 if (rstatc->updated_next)
  54                         break;
  55
  56                 /* Root has no parent to link it to, but mark it busy */
  57                 if (!parent) {
  58                         rstatc->updated_next = cgrp;
  59                         break;
  60                 }
  61
  62                 prstatc = cgroup_rstat_cpu(parent, cpu);
  63                 rstatc->updated_next = prstatc->updated_children;
  64                 prstatc->updated_children = cgrp;
  65
  66                 cgrp = parent;
  67         }
  68
  69         raw_spin_unlock_irqrestore(cpu_lock, flags);
  70 }
  71
  72 /**
  73  * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
  74  * @pos: current position
  75  * @root: root of the tree to traversal
  76  * @cpu: target cpu
  77  *
  78  * Walks the updated rstat_cpu tree on @cpu from @root.  %NULL @pos starts
  79  * the traversal and %NULL return indicates the end.  During traversal,
  80  * each returned cgroup is unlinked from the tree.  Must be called with the
  81  * matching cgroup_rstat_cpu_lock held.
  82  *
  83  * The only ordering guarantee is that, for a parent and a child pair
  84  * covered by a given traversal, if a child is visited, its parent is
  85  * guaranteed to be visited afterwards.
  86  */
  87 static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
  88                                                    struct cgroup *root, int cpu)
  89 {
  90         struct cgroup_rstat_cpu *rstatc;
  91
  92         if (pos == root)
  93                 return NULL;
  94
  95         /*
  96          * We're gonna walk down to the first leaf and visit/remove it.  We
  97          * can pick whatever unvisited node as the starting point.
  98          */
  99         if (!pos)
 100                 pos = root;
 101         else
 102                 pos = cgroup_parent(pos);
 103
 104         /* walk down to the first leaf */
 105         while (true) {
 106                 rstatc = cgroup_rstat_cpu(pos, cpu);
 107                 if (rstatc->updated_children == pos)
 108                         break;
 109                 pos = rstatc->updated_children;
 110         }
 111
 112         /*
 113          * Unlink @pos from the tree.  As the updated_children list is
 114          * singly linked, we have to walk it to find the removal point.
 115          * However, due to the way we traverse, @pos will be the first
 116          * child in most cases. The only exception is @root.
 117          */
 118         if (rstatc->updated_next) {
 119                 struct cgroup *parent = cgroup_parent(pos);
 120
 121                 if (parent) {
 122                         struct cgroup_rstat_cpu *prstatc;
 123                         struct cgroup **nextp;
 124
 125                         prstatc = cgroup_rstat_cpu(parent, cpu);
 126                         nextp = &prstatc->updated_children;
 127                         while (true) {
 128                                 struct cgroup_rstat_cpu *nrstatc;
 129
 130                                 nrstatc = cgroup_rstat_cpu(*nextp, cpu);
 131                                 if (*nextp == pos)
 132                                         break;
 133                                 WARN_ON_ONCE(*nextp == parent);
 134                                 nextp = &nrstatc->updated_next;
 135                         }
 136                         *nextp = rstatc->updated_next;
 137                 }
 138
 139                 rstatc->updated_next = NULL;
 140                 return pos;
 141         }
 142
 143         /* only happens for @root */
 144         return NULL;
 145 }
 146
 147 /* see cgroup_rstat_flush() */
 148 static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
 149         __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
 150 {
 151         int cpu;
 152
 153         lockdep_assert_held(&cgroup_rstat_lock);
 154
 155         for_each_possible_cpu(cpu) {
 156                 raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
 157                                                        cpu);
 158                 struct cgroup *pos = NULL;
 159
 160                 raw_spin_lock(cpu_lock);
 161                 while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
 162                         struct cgroup_subsys_state *css;
 163
 164                         cgroup_base_stat_flush(pos, cpu);
 165
 166                         rcu_read_lock();
 167                         list_for_each_entry_rcu(css, &pos->rstat_css_list,
 168                                                 rstat_css_node)
 169                                 css->ss->css_rstat_flush(css, cpu);
 170                         rcu_read_unlock();
 171                 }
 172                 raw_spin_unlock(cpu_lock);
 173
 174                 /* if @may_sleep, play nice and yield if necessary */
 175                 if (may_sleep && (need_resched() ||
 176                                   spin_needbreak(&cgroup_rstat_lock))) {
 177                         spin_unlock_irq(&cgroup_rstat_lock);
 178                         if (!cond_resched())
 179                                 cpu_relax();
 180                         spin_lock_irq(&cgroup_rstat_lock);
 181                 }
 182         }
 183 }
 184
 185 /**
 186  * cgroup_rstat_flush - flush stats in @cgrp's subtree
 187  * @cgrp: target cgroup
 188  *
 189  * Collect all per-cpu stats in @cgrp's subtree into the global counters
 190  * and propagate them upwards.  After this function returns, all cgroups in
 191  * the subtree have up-to-date ->stat.
 192  *
 193  * This also gets all cgroups in the subtree including @cgrp off the
 194  * ->updated_children lists.
 195  *
 196  * This function may block.
 197  */
 198 void cgroup_rstat_flush(struct cgroup *cgrp)
 199 {
 200         might_sleep();
 201
 202         spin_lock_irq(&cgroup_rstat_lock);
 203         cgroup_rstat_flush_locked(cgrp, true);
 204         spin_unlock_irq(&cgroup_rstat_lock);
 205 }
 206
 207 /**
 208  * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush()
 209  * @cgrp: target cgroup
 210  *
 211  * This function can be called from any context.
 212  */
 213 void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
 214 {
 215         unsigned long flags;
 216
 217         spin_lock_irqsave(&cgroup_rstat_lock, flags);
 218         cgroup_rstat_flush_locked(cgrp, false);
 219         spin_unlock_irqrestore(&cgroup_rstat_lock, flags);
 220 }
 221
 222 /**
 223  * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
 224  * @cgrp: target cgroup
 225  *
 226  * Flush stats in @cgrp's subtree and prevent further flushes.  Must be
 227  * paired with cgroup_rstat_flush_release().
 228  *
 229  * This function may block.
 230  */
 231 void cgroup_rstat_flush_hold(struct cgroup *cgrp)
 232         __acquires(&cgroup_rstat_lock)
 233 {
 234         might_sleep();
 235         spin_lock_irq(&cgroup_rstat_lock);
 236         cgroup_rstat_flush_locked(cgrp, true);
 237 }
 238
 239 /**
 240  * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
 241  */
 242 void cgroup_rstat_flush_release(void)
 243         __releases(&cgroup_rstat_lock)
 244 {
 245         spin_unlock_irq(&cgroup_rstat_lock);
 246 }
 247
 248 int cgroup_rstat_init(struct cgroup *cgrp)
 249 {
 250         int cpu;
 251
 252         /* the root cgrp has rstat_cpu preallocated */
 253         if (!cgrp->rstat_cpu) {
 254                 cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
 255                 if (!cgrp->rstat_cpu)
 256                         return -ENOMEM;
 257         }
 258
 259         /* ->updated_children list is self terminated */
 260         for_each_possible_cpu(cpu) {
 261                 struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
 262
 263                 rstatc->updated_children = cgrp;
 264                 u64_stats_init(&rstatc->bsync);
 265         }
 266
 267         return 0;
 268 }
 269
 270 void cgroup_rstat_exit(struct cgroup *cgrp)
 271 {
 272         int cpu;
 273
 274         cgroup_rstat_flush(cgrp);
 275
 276         /* sanity check */
 277         for_each_possible_cpu(cpu) {
 278                 struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
 279
 280                 if (WARN_ON_ONCE(rstatc->updated_children != cgrp) ||
 281                     WARN_ON_ONCE(rstatc->updated_next))
 282                         return;
 283         }
 284
 285         free_percpu(cgrp->rstat_cpu);
 286         cgrp->rstat_cpu = NULL;
 287 }
 288
 289 void __init cgroup_rstat_boot(void)
 290 {
 291         int cpu;
 292
 293         for_each_possible_cpu(cpu)
 294                 raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
 295 }
 296
 297 /*
 298  * Functions for cgroup basic resource statistics implemented on top of
 299  * rstat.
 300  */
 301 static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
 302                                  struct cgroup_base_stat *src_bstat)
 303 {
 304         dst_bstat->cputime.utime += src_bstat->cputime.utime;
 305         dst_bstat->cputime.stime += src_bstat->cputime.stime;
 306         dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
 307 }
 308
 309 static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
 310                                  struct cgroup_base_stat *src_bstat)
 311 {
 312         dst_bstat->cputime.utime -= src_bstat->cputime.utime;
 313         dst_bstat->cputime.stime -= src_bstat->cputime.stime;
 314         dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
 315 }
 316
 317 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
 318 {
 319         struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
 320         struct cgroup *parent = cgroup_parent(cgrp);
 321         struct cgroup_base_stat cur, delta;
 322         unsigned seq;
 323
 324         /* Root-level stats are sourced from system-wide CPU stats */
 325         if (!parent)
 326                 return;
 327
 328         /* fetch the current per-cpu values */
 329         do {
 330                 seq = __u64_stats_fetch_begin(&rstatc->bsync);
 331                 cur.cputime = rstatc->bstat.cputime;
 332         } while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
 333
 334         /* propagate percpu delta to global */
 335         delta = cur;
 336         cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
 337         cgroup_base_stat_add(&cgrp->bstat, &delta);
 338         cgroup_base_stat_add(&rstatc->last_bstat, &delta);
 339
 340         /* propagate global delta to parent (unless that's root) */
 341         if (cgroup_parent(parent)) {
 342                 delta = cgrp->bstat;
 343                 cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
 344                 cgroup_base_stat_add(&parent->bstat, &delta);
 345                 cgroup_base_stat_add(&cgrp->last_bstat, &delta);
 346         }
 347 }
 348
 349 static struct cgroup_rstat_cpu *
 350 cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
 351 {
 352         struct cgroup_rstat_cpu *rstatc;
 353
 354         rstatc = get_cpu_ptr(cgrp->rstat_cpu);
 355         *flags = u64_stats_update_begin_irqsave(&rstatc->bsync);
 356         return rstatc;
 357 }
 358
 359 static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
 360                                                  struct cgroup_rstat_cpu *rstatc,
 361                                                  unsigned long flags)
 362 {
 363         u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
 364         cgroup_rstat_updated(cgrp, smp_processor_id());
 365         put_cpu_ptr(rstatc);
 366 }
 367
 368 void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
 369 {
 370         struct cgroup_rstat_cpu *rstatc;
 371         unsigned long flags;
 372
 373         rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
 374         rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
 375         cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
 376 }
 377
 378 void __cgroup_account_cputime_field(struct cgroup *cgrp,
 379                                     enum cpu_usage_stat index, u64 delta_exec)
 380 {
 381         struct cgroup_rstat_cpu *rstatc;
 382         unsigned long flags;
 383
 384         rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
 385
 386         switch (index) {
 387         case CPUTIME_USER:
 388         case CPUTIME_NICE:
 389                 rstatc->bstat.cputime.utime += delta_exec;
 390                 break;
 391         case CPUTIME_SYSTEM:
 392         case CPUTIME_IRQ:
 393         case CPUTIME_SOFTIRQ:
 394                 rstatc->bstat.cputime.stime += delta_exec;
 395                 break;
 396         default:
 397                 break;
 398         }
 399
 400         cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
 401 }
 402
 403 /*
 404  * compute the cputime for the root cgroup by getting the per cpu data
 405  * at a global level, then categorizing the fields in a manner consistent
 406  * with how it is done by __cgroup_account_cputime_field for each bit of
 407  * cpu time attributed to a cgroup.
 408  */
 409 static void root_cgroup_cputime(struct task_cputime *cputime)
 410 {
 411         int i;
 412
 413         cputime->stime = 0;
 414         cputime->utime = 0;
 415         cputime->sum_exec_runtime = 0;
 416         for_each_possible_cpu(i) {
 417                 struct kernel_cpustat kcpustat;
 418                 u64 *cpustat = kcpustat.cpustat;
 419                 u64 user = 0;
 420                 u64 sys = 0;
 421
 422                 kcpustat_cpu_fetch(&kcpustat, i);
 423
 424                 user += cpustat[CPUTIME_USER];
 425                 user += cpustat[CPUTIME_NICE];
 426                 cputime->utime += user;
 427
 428                 sys += cpustat[CPUTIME_SYSTEM];
 429                 sys += cpustat[CPUTIME_IRQ];
 430                 sys += cpustat[CPUTIME_SOFTIRQ];
 431                 cputime->stime += sys;
 432
 433                 cputime->sum_exec_runtime += user;
 434                 cputime->sum_exec_runtime += sys;
 435                 cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
 436                 cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST];
 437                 cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST_NICE];
 438         }
 439 }
 440
 441 void cgroup_base_stat_cputime_show(struct seq_file *seq)
 442 {
 443         struct cgroup *cgrp = seq_css(seq)->cgroup;
 444         u64 usage, utime, stime;
 445         struct task_cputime cputime;
 446
 447         if (cgroup_parent(cgrp)) {
 448                 cgroup_rstat_flush_hold(cgrp);
 449                 usage = cgrp->bstat.cputime.sum_exec_runtime;
 450                 cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
 451                                &utime, &stime);
 452                 cgroup_rstat_flush_release();
 453         } else {
 454                 root_cgroup_cputime(&cputime);
 455                 usage = cputime.sum_exec_runtime;
 456                 utime = cputime.utime;
 457                 stime = cputime.stime;
 458         }
 459
 460         do_div(usage, NSEC_PER_USEC);
 461         do_div(utime, NSEC_PER_USEC);
 462         do_div(stime, NSEC_PER_USEC);
 463
 464         seq_printf(seq, "usage_usec %llu\n"
 465                    "user_usec %llu\n"
 466                    "system_usec %llu\n",
 467                    usage, utime, stime);
 468 }