kernel/cgroup/rstat.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 #include "cgroup-internal.h"
   3
   4 #include <linux/sched/cputime.h>
   5
   /*
    * Taken by cgroup_rstat_flush(), cgroup_rstat_flush_irqsafe() and
    * cgroup_rstat_flush_hold(); cgroup_rstat_flush_locked() asserts that it
    * is held.
    */
   6 static DEFINE_SPINLOCK(cgroup_rstat_lock);
   /*
    * One raw spinlock per CPU.  It is taken around the updated_next /
    * updated_children manipulation in cgroup_rstat_updated() and around the
    * pop-and-flush loop in cgroup_rstat_flush_locked().
    */
   7 static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
   8
   /* Defined further down; cgroup_rstat_flush_locked() calls it earlier. */
   9 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
  10
  11 static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
  12 {
  13         return per_cpu_ptr(cgrp->rstat_cpu, cpu);
  14 }
  15
  16 /**
  17  * cgroup_rstat_updated - keep track of updated rstat_cpu
  18  * @cgrp: target cgroup
  19  * @cpu: cpu on which rstat_cpu was updated
  20  *
  21  * @cgrp's rstat_cpu on @cpu was updated.  Put it on the parent's matching
  22  * rstat_cpu->updated_children list.  See the comment on top of
  23  * cgroup_rstat_cpu definition for details.
  24  */
  25 void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
  26 {
  27         raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
  28         unsigned long flags;
  29
  30         /*
  31          * Speculative already-on-list test. This may race leading to
  32          * temporary inaccuracies, which is fine.
  33          *
  34          * Because @parent's updated_children is terminated with @parent
  35          * instead of NULL, we can tell whether @cgrp is on the list by
  36          * testing the next pointer for NULL.
  37          */
  38         if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
  39                 return;
  40
  41         raw_spin_lock_irqsave(cpu_lock, flags);
  42
  43         /* put @cgrp and all ancestors on the corresponding updated lists */
  44         while (true) {
  45                 struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
  46                 struct cgroup *parent = cgroup_parent(cgrp);
  47                 struct cgroup_rstat_cpu *prstatc;
  48
  49                 /*
  50                  * Both additions and removals are bottom-up.  If a cgroup
  51                  * is already in the tree, all ancestors are.
  52                  */
  53                 if (rstatc->updated_next)
  54                         break;
  55
  56                 /* Root has no parent to link it to, but mark it busy */
  57                 if (!parent) {
  58                         rstatc->updated_next = cgrp;
  59                         break;
  60                 }
  61
  62                 prstatc = cgroup_rstat_cpu(parent, cpu);
  63                 rstatc->updated_next = prstatc->updated_children;
  64                 prstatc->updated_children = cgrp;
  65
  66                 cgrp = parent;
  67         }
  68
  69         raw_spin_unlock_irqrestore(cpu_lock, flags);
  70 }
  71
  72 /**
  73  * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
  74  * @pos: current position
  75  * @root: root of the tree to traversal
  76  * @cpu: target cpu
  77  *
  78  * Walks the updated rstat_cpu tree on @cpu from @root.  %NULL @pos starts
  79  * the traversal and %NULL return indicates the end.  During traversal,
  80  * each returned cgroup is unlinked from the tree.  Must be called with the
  81  * matching cgroup_rstat_cpu_lock held.
  82  *
  83  * The only ordering guarantee is that, for a parent and a child pair
  84  * covered by a given traversal, if a child is visited, its parent is
  85  * guaranteed to be visited afterwards.
  86  */
  87 static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
  88                                                    struct cgroup *root, int cpu)
  89 {
  90         struct cgroup_rstat_cpu *rstatc;
  91         struct cgroup *parent;
  92
  93         if (pos == root)
  94                 return NULL;
  95
  96         /*
  97          * We're gonna walk down to the first leaf and visit/remove it.  We
  98          * can pick whatever unvisited node as the starting point.
  99          */
 100         if (!pos) {
 101                 pos = root;
 102                 /* return NULL if this subtree is not on-list */
 103                 if (!cgroup_rstat_cpu(pos, cpu)->updated_next)
 104                         return NULL;
 105         } else {
 106                 pos = cgroup_parent(pos);
 107         }
 108
 109         /* walk down to the first leaf */
 110         while (true) {
 111                 rstatc = cgroup_rstat_cpu(pos, cpu);
 112                 if (rstatc->updated_children == pos)
 113                         break;
 114                 pos = rstatc->updated_children;
 115         }
 116
 117         /*
 118          * Unlink @pos from the tree.  As the updated_children list is
 119          * singly linked, we have to walk it to find the removal point.
 120          * However, due to the way we traverse, @pos will be the first
 121          * child in most cases. The only exception is @root.
 122          */
 123         parent = cgroup_parent(pos);
 124         if (parent) {
 125                 struct cgroup_rstat_cpu *prstatc;
 126                 struct cgroup **nextp;
 127
 128                 prstatc = cgroup_rstat_cpu(parent, cpu);
 129                 nextp = &prstatc->updated_children;
 130                 while (*nextp != pos) {
 131                         struct cgroup_rstat_cpu *nrstatc;
 132
 133                         nrstatc = cgroup_rstat_cpu(*nextp, cpu);
 134                         WARN_ON_ONCE(*nextp == parent);
 135                         nextp = &nrstatc->updated_next;
 136                 }
 137                 *nextp = rstatc->updated_next;
 138         }
 139
 140         rstatc->updated_next = NULL;
 141         return pos;
 142 }
 143
 144 /* see cgroup_rstat_flush() */
 145 static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
 146         __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
 147 {
 148         int cpu;
 149
 150         lockdep_assert_held(&cgroup_rstat_lock);
 151
 152         for_each_possible_cpu(cpu) {
 153                 raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
 154                                                        cpu);
 155                 struct cgroup *pos = NULL;
 156                 unsigned long flags;
 157
 158                 /*
 159                  * The _irqsave() is needed because cgroup_rstat_lock is
 160                  * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
 161                  * this lock with the _irq() suffix only disables interrupts on
 162                  * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
 163                  * interrupts on both configurations. The _irqsave() ensures
 164                  * that interrupts are always disabled and later restored.
 165                  */
 166                 raw_spin_lock_irqsave(cpu_lock, flags);
 167                 while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
 168                         struct cgroup_subsys_state *css;
 169
 170                         cgroup_base_stat_flush(pos, cpu);
 171
 172                         rcu_read_lock();
 173                         list_for_each_entry_rcu(css, &pos->rstat_css_list,
 174                                                 rstat_css_node)
 175                                 css->ss->css_rstat_flush(css, cpu);
 176                         rcu_read_unlock();
 177                 }
 178                 raw_spin_unlock_irqrestore(cpu_lock, flags);
 179
 180                 /* if @may_sleep, play nice and yield if necessary */
 181                 if (may_sleep && (need_resched() ||
 182                                   spin_needbreak(&cgroup_rstat_lock))) {
 183                         spin_unlock_irq(&cgroup_rstat_lock);
 184                         if (!cond_resched())
 185                                 cpu_relax();
 186                         spin_lock_irq(&cgroup_rstat_lock);
 187                 }
 188         }
 189 }
 190
 191 /**
 192  * cgroup_rstat_flush - flush stats in @cgrp's subtree
 193  * @cgrp: target cgroup
 194  *
 195  * Collect all per-cpu stats in @cgrp's subtree into the global counters
 196  * and propagate them upwards.  After this function returns, all cgroups in
 197  * the subtree have up-to-date ->stat.
 198  *
 199  * This also gets all cgroups in the subtree including @cgrp off the
 200  * ->updated_children lists.
 201  *
 202  * This function may block.
 203  */
 204 void cgroup_rstat_flush(struct cgroup *cgrp)
 205 {
 206         might_sleep();
 207
 208         spin_lock_irq(&cgroup_rstat_lock);
 209         cgroup_rstat_flush_locked(cgrp, true);
 210         spin_unlock_irq(&cgroup_rstat_lock);
 211 }
 212
 213 /**
 214  * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush()
 215  * @cgrp: target cgroup
 216  *
 217  * This function can be called from any context.
 218  */
 219 void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
 220 {
 221         unsigned long flags;
 222
 223         spin_lock_irqsave(&cgroup_rstat_lock, flags);
 224         cgroup_rstat_flush_locked(cgrp, false);
 225         spin_unlock_irqrestore(&cgroup_rstat_lock, flags);
 226 }
 227
 228 /**
 229  * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
 230  * @cgrp: target cgroup
 231  *
 232  * Flush stats in @cgrp's subtree and prevent further flushes.  Must be
 233  * paired with cgroup_rstat_flush_release().
 234  *
 235  * This function may block.
 236  */
 237 void cgroup_rstat_flush_hold(struct cgroup *cgrp)
 238         __acquires(&cgroup_rstat_lock)
 239 {
 240         might_sleep();
 241         spin_lock_irq(&cgroup_rstat_lock);
 242         cgroup_rstat_flush_locked(cgrp, true);
 243 }
 244
 245 /**
 246  * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
 247  */
 248 void cgroup_rstat_flush_release(void)
 249         __releases(&cgroup_rstat_lock)
 250 {
 251         spin_unlock_irq(&cgroup_rstat_lock);
 252 }
 253
 254 int cgroup_rstat_init(struct cgroup *cgrp)
 255 {
 256         int cpu;
 257
 258         /* the root cgrp has rstat_cpu preallocated */
 259         if (!cgrp->rstat_cpu) {
 260                 cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
 261                 if (!cgrp->rstat_cpu)
 262                         return -ENOMEM;
 263         }
 264
 265         /* ->updated_children list is self terminated */
 266         for_each_possible_cpu(cpu) {
 267                 struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
 268
 269                 rstatc->updated_children = cgrp;
 270                 u64_stats_init(&rstatc->bsync);
 271         }
 272
 273         return 0;
 274 }
 275
 276 void cgroup_rstat_exit(struct cgroup *cgrp)
 277 {
 278         int cpu;
 279
 280         cgroup_rstat_flush(cgrp);
 281
 282         /* sanity check */
 283         for_each_possible_cpu(cpu) {
 284                 struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
 285
 286                 if (WARN_ON_ONCE(rstatc->updated_children != cgrp) ||
 287                     WARN_ON_ONCE(rstatc->updated_next))
 288                         return;
 289         }
 290
 291         free_percpu(cgrp->rstat_cpu);
 292         cgrp->rstat_cpu = NULL;
 293 }
 294
 295 void __init cgroup_rstat_boot(void)
 296 {
 297         int cpu;
 298
 299         for_each_possible_cpu(cpu)
 300                 raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
 301 }
 302
 303 /*
 304  * Functions for cgroup basic resource statistics implemented on top of
 305  * rstat.
 306  */
 307 static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
 308                                  struct cgroup_base_stat *src_bstat)
 309 {
 310         dst_bstat->cputime.utime += src_bstat->cputime.utime;
 311         dst_bstat->cputime.stime += src_bstat->cputime.stime;
 312         dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
 313 }
 314
 315 static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
 316                                  struct cgroup_base_stat *src_bstat)
 317 {
 318         dst_bstat->cputime.utime -= src_bstat->cputime.utime;
 319         dst_bstat->cputime.stime -= src_bstat->cputime.stime;
 320         dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
 321 }
 322
 323 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
 324 {
 325         struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
 326         struct cgroup *parent = cgroup_parent(cgrp);
 327         struct cgroup_base_stat delta;
 328         unsigned seq;
 329
 330         /* Root-level stats are sourced from system-wide CPU stats */
 331         if (!parent)
 332                 return;
 333
 334         /* fetch the current per-cpu values */
 335         do {
 336                 seq = __u64_stats_fetch_begin(&rstatc->bsync);
 337                 delta = rstatc->bstat;
 338         } while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
 339
 340         /* propagate percpu delta to global */
 341         cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
 342         cgroup_base_stat_add(&cgrp->bstat, &delta);
 343         cgroup_base_stat_add(&rstatc->last_bstat, &delta);
 344
 345         /* propagate global delta to parent (unless that's root) */
 346         if (cgroup_parent(parent)) {
 347                 delta = cgrp->bstat;
 348                 cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
 349                 cgroup_base_stat_add(&parent->bstat, &delta);
 350                 cgroup_base_stat_add(&cgrp->last_bstat, &delta);
 351         }
 352 }
 353
 354 static struct cgroup_rstat_cpu *
 355 cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
 356 {
 357         struct cgroup_rstat_cpu *rstatc;
 358
 359         rstatc = get_cpu_ptr(cgrp->rstat_cpu);
 360         *flags = u64_stats_update_begin_irqsave(&rstatc->bsync);
 361         return rstatc;
 362 }
 363
 364 static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
 365                                                  struct cgroup_rstat_cpu *rstatc,
 366                                                  unsigned long flags)
 367 {
 368         u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
 369         cgroup_rstat_updated(cgrp, smp_processor_id());
 370         put_cpu_ptr(rstatc);
 371 }
 372
 373 void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
 374 {
 375         struct cgroup_rstat_cpu *rstatc;
 376         unsigned long flags;
 377
 378         rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
 379         rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
 380         cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
 381 }
 382
 383 void __cgroup_account_cputime_field(struct cgroup *cgrp,
 384                                     enum cpu_usage_stat index, u64 delta_exec)
 385 {
 386         struct cgroup_rstat_cpu *rstatc;
 387         unsigned long flags;
 388
 389         rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
 390
 391         switch (index) {
 392         case CPUTIME_USER:
 393         case CPUTIME_NICE:
 394                 rstatc->bstat.cputime.utime += delta_exec;
 395                 break;
 396         case CPUTIME_SYSTEM:
 397         case CPUTIME_IRQ:
 398         case CPUTIME_SOFTIRQ:
 399                 rstatc->bstat.cputime.stime += delta_exec;
 400                 break;
 401         default:
 402                 break;
 403         }
 404
 405         cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
 406 }
 407
 408 /*
 409  * compute the cputime for the root cgroup by getting the per cpu data
 410  * at a global level, then categorizing the fields in a manner consistent
 411  * with how it is done by __cgroup_account_cputime_field for each bit of
 412  * cpu time attributed to a cgroup.
 413  */
 414 static void root_cgroup_cputime(struct task_cputime *cputime)
 415 {
 416         int i;
 417
 418         cputime->stime = 0;
 419         cputime->utime = 0;
 420         cputime->sum_exec_runtime = 0;
 421         for_each_possible_cpu(i) {
 422                 struct kernel_cpustat kcpustat;
 423                 u64 *cpustat = kcpustat.cpustat;
 424                 u64 user = 0;
 425                 u64 sys = 0;
 426
 427                 kcpustat_cpu_fetch(&kcpustat, i);
 428
 429                 user += cpustat[CPUTIME_USER];
 430                 user += cpustat[CPUTIME_NICE];
 431                 cputime->utime += user;
 432
 433                 sys += cpustat[CPUTIME_SYSTEM];
 434                 sys += cpustat[CPUTIME_IRQ];
 435                 sys += cpustat[CPUTIME_SOFTIRQ];
 436                 cputime->stime += sys;
 437
 438                 cputime->sum_exec_runtime += user;
 439                 cputime->sum_exec_runtime += sys;
 440                 cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
 441         }
 442 }
 443
 444 void cgroup_base_stat_cputime_show(struct seq_file *seq)
 445 {
 446         struct cgroup *cgrp = seq_css(seq)->cgroup;
 447         u64 usage, utime, stime;
 448         struct task_cputime cputime;
 449
 450         if (cgroup_parent(cgrp)) {
 451                 cgroup_rstat_flush_hold(cgrp);
 452                 usage = cgrp->bstat.cputime.sum_exec_runtime;
 453                 cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
 454                                &utime, &stime);
 455                 cgroup_rstat_flush_release();
 456         } else {
 457                 root_cgroup_cputime(&cputime);
 458                 usage = cputime.sum_exec_runtime;
 459                 utime = cputime.utime;
 460                 stime = cputime.stime;
 461         }
 462
 463         do_div(usage, NSEC_PER_USEC);
 464         do_div(utime, NSEC_PER_USEC);
 465         do_div(stime, NSEC_PER_USEC);
 466
 467         seq_printf(seq, "usage_usec %llu\n"
 468                    "user_usec %llu\n"
 469                    "system_usec %llu\n",
 470                    usage, utime, stime);
 471 }