kernel/cgroup/rstat.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 #include "cgroup-internal.h"
   3
   4 #include <linux/sched/cputime.h>
   5
   6 static DEFINE_SPINLOCK(cgroup_rstat_lock);
   7 static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
   8
   9 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
  10
  11 static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
  12 {
  13         return per_cpu_ptr(cgrp->rstat_cpu, cpu);
  14 }
  15
  16 /**
  17  * cgroup_rstat_updated - keep track of updated rstat_cpu
  18  * @cgrp: target cgroup
  19  * @cpu: cpu on which rstat_cpu was updated
  20  *
  21  * @cgrp's rstat_cpu on @cpu was updated.  Put it on the parent's matching
  22  * rstat_cpu->updated_children list.  See the comment on top of
  23  * cgroup_rstat_cpu definition for details.
  24  */
  25 void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
  26 {
  27         raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
  28         unsigned long flags;
  29
  30         /*
  31          * Speculative already-on-list test. This may race leading to
  32          * temporary inaccuracies, which is fine.
  33          *
  34          * Because @parent's updated_children is terminated with @parent
  35          * instead of NULL, we can tell whether @cgrp is on the list by
  36          * testing the next pointer for NULL.
  37          */
  38         if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
  39                 return;
  40
  41         raw_spin_lock_irqsave(cpu_lock, flags);
  42
  43         /* put @cgrp and all ancestors on the corresponding updated lists */
  44         while (true) {
  45                 struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
  46                 struct cgroup *parent = cgroup_parent(cgrp);
  47                 struct cgroup_rstat_cpu *prstatc;
  48
  49                 /*
  50                  * Both additions and removals are bottom-up.  If a cgroup
  51                  * is already in the tree, all ancestors are.
  52                  */
  53                 if (rstatc->updated_next)
  54                         break;
  55
  56                 /* Root has no parent to link it to, but mark it busy */
  57                 if (!parent) {
  58                         rstatc->updated_next = cgrp;
  59                         break;
  60                 }
  61
  62                 prstatc = cgroup_rstat_cpu(parent, cpu);
  63                 rstatc->updated_next = prstatc->updated_children;
  64                 prstatc->updated_children = cgrp;
  65
  66                 cgrp = parent;
  67         }
  68
  69         raw_spin_unlock_irqrestore(cpu_lock, flags);
  70 }
  71
  72 /**
  73  * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
  74  * @pos: current position
  75  * @root: root of the tree to traversal
  76  * @cpu: target cpu
  77  *
  78  * Walks the updated rstat_cpu tree on @cpu from @root.  %NULL @pos starts
  79  * the traversal and %NULL return indicates the end.  During traversal,
  80  * each returned cgroup is unlinked from the tree.  Must be called with the
  81  * matching cgroup_rstat_cpu_lock held.
  82  *
  83  * The only ordering guarantee is that, for a parent and a child pair
  84  * covered by a given traversal, if a child is visited, its parent is
  85  * guaranteed to be visited afterwards.
  86  */
  87 static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
  88                                                    struct cgroup *root, int cpu)
  89 {
  90         struct cgroup_rstat_cpu *rstatc;
  91         struct cgroup *parent;
  92
  93         if (pos == root)
  94                 return NULL;
  95
  96         /*
  97          * We're gonna walk down to the first leaf and visit/remove it.  We
  98          * can pick whatever unvisited node as the starting point.
  99          */
 100         if (!pos) {
 101                 pos = root;
 102                 /* return NULL if this subtree is not on-list */
 103                 if (!cgroup_rstat_cpu(pos, cpu)->updated_next)
 104                         return NULL;
 105         } else {
 106                 pos = cgroup_parent(pos);
 107         }
 108
 109         /* walk down to the first leaf */
 110         while (true) {
 111                 rstatc = cgroup_rstat_cpu(pos, cpu);
 112                 if (rstatc->updated_children == pos)
 113                         break;
 114                 pos = rstatc->updated_children;
 115         }
 116
 117         /*
 118          * Unlink @pos from the tree.  As the updated_children list is
 119          * singly linked, we have to walk it to find the removal point.
 120          * However, due to the way we traverse, @pos will be the first
 121          * child in most cases. The only exception is @root.
 122          */
 123         parent = cgroup_parent(pos);
 124         if (parent) {
 125                 struct cgroup_rstat_cpu *prstatc;
 126                 struct cgroup **nextp;
 127
 128                 prstatc = cgroup_rstat_cpu(parent, cpu);
 129                 nextp = &prstatc->updated_children;
 130                 while (*nextp != pos) {
 131                         struct cgroup_rstat_cpu *nrstatc;
 132
 133                         nrstatc = cgroup_rstat_cpu(*nextp, cpu);
 134                         WARN_ON_ONCE(*nextp == parent);
 135                         nextp = &nrstatc->updated_next;
 136                 }
 137                 *nextp = rstatc->updated_next;
 138         }
 139
 140         rstatc->updated_next = NULL;
 141         return pos;
 142 }
 143
 144 /* see cgroup_rstat_flush() */
 145 static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
 146         __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
 147 {
 148         int cpu;
 149
 150         lockdep_assert_held(&cgroup_rstat_lock);
 151
 152         for_each_possible_cpu(cpu) {
 153                 raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
 154                                                        cpu);
 155                 struct cgroup *pos = NULL;
 156                 unsigned long flags;
 157
 158                 /*
 159                  * The _irqsave() is needed because cgroup_rstat_lock is
 160                  * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
 161                  * this lock with the _irq() suffix only disables interrupts on
 162                  * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
 163                  * interrupts on both configurations. The _irqsave() ensures
 164                  * that interrupts are always disabled and later restored.
 165                  */
 166                 raw_spin_lock_irqsave(cpu_lock, flags);
 167                 while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
 168                         struct cgroup_subsys_state *css;
 169
 170                         cgroup_base_stat_flush(pos, cpu);
 171
 172                         rcu_read_lock();
 173                         list_for_each_entry_rcu(css, &pos->rstat_css_list,
 174                                                 rstat_css_node)
 175                                 css->ss->css_rstat_flush(css, cpu);
 176                         rcu_read_unlock();
 177                 }
 178                 raw_spin_unlock_irqrestore(cpu_lock, flags);
 179
 180                 /* if @may_sleep, play nice and yield if necessary */
 181                 if (may_sleep && (need_resched() ||
 182                                   spin_needbreak(&cgroup_rstat_lock))) {
 183                         spin_unlock_irq(&cgroup_rstat_lock);
 184                         if (!cond_resched())
 185                                 cpu_relax();
 186                         spin_lock_irq(&cgroup_rstat_lock);
 187                 }
 188         }
 189 }
 190
 191 /**
 192  * cgroup_rstat_flush - flush stats in @cgrp's subtree
 193  * @cgrp: target cgroup
 194  *
 195  * Collect all per-cpu stats in @cgrp's subtree into the global counters
 196  * and propagate them upwards.  After this function returns, all cgroups in
 197  * the subtree have up-to-date ->stat.
 198  *
 199  * This also gets all cgroups in the subtree including @cgrp off the
 200  * ->updated_children lists.
 201  *
 202  * This function may block.
 203  */
 204 void cgroup_rstat_flush(struct cgroup *cgrp)
 205 {
 206         might_sleep();
 207
 208         spin_lock_irq(&cgroup_rstat_lock);
 209         cgroup_rstat_flush_locked(cgrp, true);
 210         spin_unlock_irq(&cgroup_rstat_lock);
 211 }
 212
 213 /**
 214  * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush()
 215  * @cgrp: target cgroup
 216  *
 217  * This function can be called from any context.
 218  */
 219 void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
 220 {
 221         unsigned long flags;
 222
 223         spin_lock_irqsave(&cgroup_rstat_lock, flags);
 224         cgroup_rstat_flush_locked(cgrp, false);
 225         spin_unlock_irqrestore(&cgroup_rstat_lock, flags);
 226 }
 227
 228 /**
 229  * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
 230  * @cgrp: target cgroup
 231  *
 232  * Flush stats in @cgrp's subtree and prevent further flushes.  Must be
 233  * paired with cgroup_rstat_flush_release().
 234  *
 235  * This function may block.
 236  */
 237 void cgroup_rstat_flush_hold(struct cgroup *cgrp)
 238         __acquires(&cgroup_rstat_lock)
 239 {
 240         might_sleep();
 241         spin_lock_irq(&cgroup_rstat_lock);
 242         cgroup_rstat_flush_locked(cgrp, true);
 243 }
 244
 245 /**
 246  * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
 247  */
 248 void cgroup_rstat_flush_release(void)
 249         __releases(&cgroup_rstat_lock)
 250 {
 251         spin_unlock_irq(&cgroup_rstat_lock);
 252 }
 253
 254 int cgroup_rstat_init(struct cgroup *cgrp)
 255 {
 256         int cpu;
 257
 258         /* the root cgrp has rstat_cpu preallocated */
 259         if (!cgrp->rstat_cpu) {
 260                 cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
 261                 if (!cgrp->rstat_cpu)
 262                         return -ENOMEM;
 263         }
 264
 265         /* ->updated_children list is self terminated */
 266         for_each_possible_cpu(cpu) {
 267                 struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
 268
 269                 rstatc->updated_children = cgrp;
 270                 u64_stats_init(&rstatc->bsync);
 271         }
 272
 273         return 0;
 274 }
 275
 276 void cgroup_rstat_exit(struct cgroup *cgrp)
 277 {
 278         int cpu;
 279
 280         cgroup_rstat_flush(cgrp);
 281
 282         /* sanity check */
 283         for_each_possible_cpu(cpu) {
 284                 struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
 285
 286                 if (WARN_ON_ONCE(rstatc->updated_children != cgrp) ||
 287                     WARN_ON_ONCE(rstatc->updated_next))
 288                         return;
 289         }
 290
 291         free_percpu(cgrp->rstat_cpu);
 292         cgrp->rstat_cpu = NULL;
 293 }
 294
 295 void __init cgroup_rstat_boot(void)
 296 {
 297         int cpu;
 298
 299         for_each_possible_cpu(cpu)
 300                 raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
 301 }
 302
 303 /*
 304  * Functions for cgroup basic resource statistics implemented on top of
 305  * rstat.
 306  */
 307 static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
 308                                  struct cgroup_base_stat *src_bstat)
 309 {
 310         dst_bstat->cputime.utime += src_bstat->cputime.utime;
 311         dst_bstat->cputime.stime += src_bstat->cputime.stime;
 312         dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
 313 #ifdef CONFIG_SCHED_CORE
 314         dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
 315 #endif
 316 }
 317
 318 static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
 319                                  struct cgroup_base_stat *src_bstat)
 320 {
 321         dst_bstat->cputime.utime -= src_bstat->cputime.utime;
 322         dst_bstat->cputime.stime -= src_bstat->cputime.stime;
 323         dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
 324 #ifdef CONFIG_SCHED_CORE
 325         dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
 326 #endif
 327 }
 328
 329 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
 330 {
 331         struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
 332         struct cgroup *parent = cgroup_parent(cgrp);
 333         struct cgroup_base_stat delta;
 334         unsigned seq;
 335
 336         /* Root-level stats are sourced from system-wide CPU stats */
 337         if (!parent)
 338                 return;
 339
 340         /* fetch the current per-cpu values */
 341         do {
 342                 seq = __u64_stats_fetch_begin(&rstatc->bsync);
 343                 delta = rstatc->bstat;
 344         } while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
 345
 346         /* propagate percpu delta to global */
 347         cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
 348         cgroup_base_stat_add(&cgrp->bstat, &delta);
 349         cgroup_base_stat_add(&rstatc->last_bstat, &delta);
 350
 351         /* propagate global delta to parent (unless that's root) */
 352         if (cgroup_parent(parent)) {
 353                 delta = cgrp->bstat;
 354                 cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
 355                 cgroup_base_stat_add(&parent->bstat, &delta);
 356                 cgroup_base_stat_add(&cgrp->last_bstat, &delta);
 357         }
 358 }
 359
 360 static struct cgroup_rstat_cpu *
 361 cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
 362 {
 363         struct cgroup_rstat_cpu *rstatc;
 364
 365         rstatc = get_cpu_ptr(cgrp->rstat_cpu);
 366         *flags = u64_stats_update_begin_irqsave(&rstatc->bsync);
 367         return rstatc;
 368 }
 369
 370 static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
 371                                                  struct cgroup_rstat_cpu *rstatc,
 372                                                  unsigned long flags)
 373 {
 374         u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
 375         cgroup_rstat_updated(cgrp, smp_processor_id());
 376         put_cpu_ptr(rstatc);
 377 }
 378
 379 void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
 380 {
 381         struct cgroup_rstat_cpu *rstatc;
 382         unsigned long flags;
 383
 384         rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
 385         rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
 386         cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
 387 }
 388
 389 void __cgroup_account_cputime_field(struct cgroup *cgrp,
 390                                     enum cpu_usage_stat index, u64 delta_exec)
 391 {
 392         struct cgroup_rstat_cpu *rstatc;
 393         unsigned long flags;
 394
 395         rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
 396
 397         switch (index) {
 398         case CPUTIME_USER:
 399         case CPUTIME_NICE:
 400                 rstatc->bstat.cputime.utime += delta_exec;
 401                 break;
 402         case CPUTIME_SYSTEM:
 403         case CPUTIME_IRQ:
 404         case CPUTIME_SOFTIRQ:
 405                 rstatc->bstat.cputime.stime += delta_exec;
 406                 break;
 407 #ifdef CONFIG_SCHED_CORE
 408         case CPUTIME_FORCEIDLE:
 409                 rstatc->bstat.forceidle_sum += delta_exec;
 410                 break;
 411 #endif
 412         default:
 413                 break;
 414         }
 415
 416         cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
 417 }
 418
 419 /*
 420  * compute the cputime for the root cgroup by getting the per cpu data
 421  * at a global level, then categorizing the fields in a manner consistent
 422  * with how it is done by __cgroup_account_cputime_field for each bit of
 423  * cpu time attributed to a cgroup.
 424  */
 425 static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
 426 {
 427         struct task_cputime *cputime = &bstat->cputime;
 428         int i;
 429
 430         cputime->stime = 0;
 431         cputime->utime = 0;
 432         cputime->sum_exec_runtime = 0;
 433         for_each_possible_cpu(i) {
 434                 struct kernel_cpustat kcpustat;
 435                 u64 *cpustat = kcpustat.cpustat;
 436                 u64 user = 0;
 437                 u64 sys = 0;
 438
 439                 kcpustat_cpu_fetch(&kcpustat, i);
 440
 441                 user += cpustat[CPUTIME_USER];
 442                 user += cpustat[CPUTIME_NICE];
 443                 cputime->utime += user;
 444
 445                 sys += cpustat[CPUTIME_SYSTEM];
 446                 sys += cpustat[CPUTIME_IRQ];
 447                 sys += cpustat[CPUTIME_SOFTIRQ];
 448                 cputime->stime += sys;
 449
 450                 cputime->sum_exec_runtime += user;
 451                 cputime->sum_exec_runtime += sys;
 452                 cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
 453
 454 #ifdef CONFIG_SCHED_CORE
 455                 bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
 456 #endif
 457         }
 458 }
 459
 460 void cgroup_base_stat_cputime_show(struct seq_file *seq)
 461 {
 462         struct cgroup *cgrp = seq_css(seq)->cgroup;
 463         u64 usage, utime, stime;
 464         struct cgroup_base_stat bstat;
 465 #ifdef CONFIG_SCHED_CORE
 466         u64 forceidle_time;
 467 #endif
 468
 469         if (cgroup_parent(cgrp)) {
 470                 cgroup_rstat_flush_hold(cgrp);
 471                 usage = cgrp->bstat.cputime.sum_exec_runtime;
 472                 cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
 473                                &utime, &stime);
 474 #ifdef CONFIG_SCHED_CORE
 475                 forceidle_time = cgrp->bstat.forceidle_sum;
 476 #endif
 477                 cgroup_rstat_flush_release();
 478         } else {
 479                 root_cgroup_cputime(&bstat);
 480                 usage = bstat.cputime.sum_exec_runtime;
 481                 utime = bstat.cputime.utime;
 482                 stime = bstat.cputime.stime;
 483 #ifdef CONFIG_SCHED_CORE
 484                 forceidle_time = bstat.forceidle_sum;
 485 #endif
 486         }
 487
 488         do_div(usage, NSEC_PER_USEC);
 489         do_div(utime, NSEC_PER_USEC);
 490         do_div(stime, NSEC_PER_USEC);
 491 #ifdef CONFIG_SCHED_CORE
 492         do_div(forceidle_time, NSEC_PER_USEC);
 493 #endif
 494
 495         seq_printf(seq, "usage_usec %llu\n"
 496                    "user_usec %llu\n"
 497                    "system_usec %llu\n",
 498                    usage, utime, stime);
 499
 500 #ifdef CONFIG_SCHED_CORE
 501         seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
 502 #endif
 503 }