kernel/profile.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  *  linux/kernel/profile.c
   4  *  Simple profiling. Manages a direct-mapped profile hit count buffer,
   5  *  with configurable resolution, support for restricting the cpus on
   6  *  which profiling is done, and switching between cpu time and
   7  *  schedule() calls via kernel command line parameters passed at boot.
   8  *
   9  *  Scheduler profiling support, Arjan van de Ven and Ingo Molnar,
  10  *      Red Hat, July 2004
  11  *  Consolidation of architecture support code for profiling,
  12  *      Nadia Yvette Chambers, Oracle, July 2004
  13  *  Amortized hit count accounting via per-cpu open-addressed hashtables
  14  *      to resolve timer interrupt livelocks, Nadia Yvette Chambers,
  15  *      Oracle, 2004
  16  */
  17
  18 #include <linux/export.h>
  19 #include <linux/profile.h>
  20 #include <linux/memblock.h>
  21 #include <linux/notifier.h>
  22 #include <linux/mm.h>
  23 #include <linux/cpumask.h>
  24 #include <linux/cpu.h>
  25 #include <linux/highmem.h>
  26 #include <linux/mutex.h>
  27 #include <linux/slab.h>
  28 #include <linux/vmalloc.h>
  29 #include <linux/sched/stat.h>
  30
  31 #include <asm/sections.h>
  32 #include <asm/irq_regs.h>
  33 #include <asm/ptrace.h>
  34
  35 struct profile_hit {
  36         u32 pc, hits;
  37 };
  38 #define PROFILE_GRPSHIFT        3
  39 #define PROFILE_GRPSZ           (1 << PROFILE_GRPSHIFT)
  40 #define NR_PROFILE_HIT          (PAGE_SIZE/sizeof(struct profile_hit))
  41 #define NR_PROFILE_GRP          (NR_PROFILE_HIT/PROFILE_GRPSZ)
  42
  43 static atomic_t *prof_buffer;
  44 static unsigned long prof_len, prof_shift;
  45
  46 int prof_on __read_mostly;
  47 EXPORT_SYMBOL_GPL(prof_on);
  48
  49 static cpumask_var_t prof_cpu_mask;
  50 #if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS)
  51 static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
  52 static DEFINE_PER_CPU(int, cpu_profile_flip);
  53 static DEFINE_MUTEX(profile_flip_mutex);
  54 #endif /* CONFIG_SMP */
  55
  56 int profile_setup(char *str)
  57 {
  58         static const char schedstr[] = "schedule";
  59         static const char sleepstr[] = "sleep";
  60         static const char kvmstr[] = "kvm";
  61         int par;
  62
  63         if (!strncmp(str, sleepstr, strlen(sleepstr))) {
  64 #ifdef CONFIG_SCHEDSTATS
  65                 force_schedstat_enabled();
  66                 prof_on = SLEEP_PROFILING;
  67                 if (str[strlen(sleepstr)] == ',')
  68                         str += strlen(sleepstr) + 1;
  69                 if (get_option(&str, &par))
  70                         prof_shift = par;
  71                 pr_info("kernel sleep profiling enabled (shift: %ld)\n",
  72                         prof_shift);
  73 #else
  74                 pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
  75 #endif /* CONFIG_SCHEDSTATS */
  76         } else if (!strncmp(str, schedstr, strlen(schedstr))) {
  77                 prof_on = SCHED_PROFILING;
  78                 if (str[strlen(schedstr)] == ',')
  79                         str += strlen(schedstr) + 1;
  80                 if (get_option(&str, &par))
  81                         prof_shift = par;
  82                 pr_info("kernel schedule profiling enabled (shift: %ld)\n",
  83                         prof_shift);
  84         } else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
  85                 prof_on = KVM_PROFILING;
  86                 if (str[strlen(kvmstr)] == ',')
  87                         str += strlen(kvmstr) + 1;
  88                 if (get_option(&str, &par))
  89                         prof_shift = par;
  90                 pr_info("kernel KVM profiling enabled (shift: %ld)\n",
  91                         prof_shift);
  92         } else if (get_option(&str, &par)) {
  93                 prof_shift = par;
  94                 prof_on = CPU_PROFILING;
  95                 pr_info("kernel profiling enabled (shift: %ld)\n",
  96                         prof_shift);
  97         }
  98         return 1;
  99 }
 100 __setup("profile=", profile_setup);
 101
 102
 103 int __ref profile_init(void)
 104 {
 105         int buffer_bytes;
 106         if (!prof_on)
 107                 return 0;
 108
 109         /* only text is profiled */
 110         prof_len = (_etext - _stext) >> prof_shift;
 111         buffer_bytes = prof_len*sizeof(atomic_t);
 112
 113         if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
 114                 return -ENOMEM;
 115
 116         cpumask_copy(prof_cpu_mask, cpu_possible_mask);
 117
 118         prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN);
 119         if (prof_buffer)
 120                 return 0;
 121
 122         prof_buffer = alloc_pages_exact(buffer_bytes,
 123                                         GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN);
 124         if (prof_buffer)
 125                 return 0;
 126
 127         prof_buffer = vzalloc(buffer_bytes);
 128         if (prof_buffer)
 129                 return 0;
 130
 131         free_cpumask_var(prof_cpu_mask);
 132         return -ENOMEM;
 133 }
 134
 135 /* Profile event notifications */
 136
 137 static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
 138 static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
 139 static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
 140
 141 void profile_task_exit(struct task_struct *task)
 142 {
 143         blocking_notifier_call_chain(&task_exit_notifier, 0, task);
 144 }
 145
 146 int profile_handoff_task(struct task_struct *task)
 147 {
 148         int ret;
 149         ret = atomic_notifier_call_chain(&task_free_notifier, 0, task);
 150         return (ret == NOTIFY_OK) ? 1 : 0;
 151 }
 152
 153 void profile_munmap(unsigned long addr)
 154 {
 155         blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr);
 156 }
 157
 158 int task_handoff_register(struct notifier_block *n)
 159 {
 160         return atomic_notifier_chain_register(&task_free_notifier, n);
 161 }
 162 EXPORT_SYMBOL_GPL(task_handoff_register);
 163
 164 int task_handoff_unregister(struct notifier_block *n)
 165 {
 166         return atomic_notifier_chain_unregister(&task_free_notifier, n);
 167 }
 168 EXPORT_SYMBOL_GPL(task_handoff_unregister);
 169
 170 int profile_event_register(enum profile_type type, struct notifier_block *n)
 171 {
 172         int err = -EINVAL;
 173
 174         switch (type) {
 175         case PROFILE_TASK_EXIT:
 176                 err = blocking_notifier_chain_register(
 177                                 &task_exit_notifier, n);
 178                 break;
 179         case PROFILE_MUNMAP:
 180                 err = blocking_notifier_chain_register(
 181                                 &munmap_notifier, n);
 182                 break;
 183         }
 184
 185         return err;
 186 }
 187 EXPORT_SYMBOL_GPL(profile_event_register);
 188
 189 int profile_event_unregister(enum profile_type type, struct notifier_block *n)
 190 {
 191         int err = -EINVAL;
 192
 193         switch (type) {
 194         case PROFILE_TASK_EXIT:
 195                 err = blocking_notifier_chain_unregister(
 196                                 &task_exit_notifier, n);
 197                 break;
 198         case PROFILE_MUNMAP:
 199                 err = blocking_notifier_chain_unregister(
 200                                 &munmap_notifier, n);
 201                 break;
 202         }
 203
 204         return err;
 205 }
 206 EXPORT_SYMBOL_GPL(profile_event_unregister);
 207
 208 #if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS)
 209 /*
 210  * Each cpu has a pair of open-addressed hashtables for pending
 211  * profile hits. read_profile() IPI's all cpus to request them
 212  * to flip buffers and flushes their contents to prof_buffer itself.
 213  * Flip requests are serialized by the profile_flip_mutex. The sole
 214  * use of having a second hashtable is for avoiding cacheline
 215  * contention that would otherwise happen during flushes of pending
 216  * profile hits required for the accuracy of reported profile hits
 217  * and so resurrect the interrupt livelock issue.
 218  *
 219  * The open-addressed hashtables are indexed by profile buffer slot
 220  * and hold the number of pending hits to that profile buffer slot on
 221  * a cpu in an entry. When the hashtable overflows, all pending hits
 222  * are accounted to their corresponding profile buffer slots with
 223  * atomic_add() and the hashtable emptied. As numerous pending hits
 224  * may be accounted to a profile buffer slot in a hashtable entry,
 225  * this amortizes a number of atomic profile buffer increments likely
 226  * to be far larger than the number of entries in the hashtable,
 227  * particularly given that the number of distinct profile buffer
 228  * positions to which hits are accounted during short intervals (e.g.
 229  * several seconds) is usually very small. Exclusion from buffer
 230  * flipping is provided by interrupt disablement (note that for
 231  * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from
 232  * process context).
 233  * The hash function is meant to be lightweight as opposed to strong,
 234  * and was vaguely inspired by ppc64 firmware-supported inverted
 235  * pagetable hash functions, but uses a full hashtable full of finite
 236  * collision chains, not just pairs of them.
 237  *
 238  * -- nyc
 239  */
 240 static void __profile_flip_buffers(void *unused)
 241 {
 242         int cpu = smp_processor_id();
 243
 244         per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu);
 245 }
 246
 247 static void profile_flip_buffers(void)
 248 {
 249         int i, j, cpu;
 250
 251         mutex_lock(&profile_flip_mutex);
 252         j = per_cpu(cpu_profile_flip, get_cpu());
 253         put_cpu();
 254         on_each_cpu(__profile_flip_buffers, NULL, 1);
 255         for_each_online_cpu(cpu) {
 256                 struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j];
 257                 for (i = 0; i < NR_PROFILE_HIT; ++i) {
 258                         if (!hits[i].hits) {
 259                                 if (hits[i].pc)
 260                                         hits[i].pc = 0;
 261                                 continue;
 262                         }
 263                         atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
 264                         hits[i].hits = hits[i].pc = 0;
 265                 }
 266         }
 267         mutex_unlock(&profile_flip_mutex);
 268 }
 269
 270 static void profile_discard_flip_buffers(void)
 271 {
 272         int i, cpu;
 273
 274         mutex_lock(&profile_flip_mutex);
 275         i = per_cpu(cpu_profile_flip, get_cpu());
 276         put_cpu();
 277         on_each_cpu(__profile_flip_buffers, NULL, 1);
 278         for_each_online_cpu(cpu) {
 279                 struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
 280                 memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
 281         }
 282         mutex_unlock(&profile_flip_mutex);
 283 }
 284
 285 static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
 286 {
 287         unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
 288         int i, j, cpu;
 289         struct profile_hit *hits;
 290
 291         pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
 292         i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
 293         secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
 294         cpu = get_cpu();
 295         hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)];
 296         if (!hits) {
 297                 put_cpu();
 298                 return;
 299         }
 300         /*
 301          * We buffer the global profiler buffer into a per-CPU
 302          * queue and thus reduce the number of global (and possibly
 303          * NUMA-alien) accesses. The write-queue is self-coalescing:
 304          */
 305         local_irq_save(flags);
 306         do {
 307                 for (j = 0; j < PROFILE_GRPSZ; ++j) {
 308                         if (hits[i + j].pc == pc) {
 309                                 hits[i + j].hits += nr_hits;
 310                                 goto out;
 311                         } else if (!hits[i + j].hits) {
 312                                 hits[i + j].pc = pc;
 313                                 hits[i + j].hits = nr_hits;
 314                                 goto out;
 315                         }
 316                 }
 317                 i = (i + secondary) & (NR_PROFILE_HIT - 1);
 318         } while (i != primary);
 319
 320         /*
 321          * Add the current hit(s) and flush the write-queue out
 322          * to the global buffer:
 323          */
 324         atomic_add(nr_hits, &prof_buffer[pc]);
 325         for (i = 0; i < NR_PROFILE_HIT; ++i) {
 326                 atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
 327                 hits[i].pc = hits[i].hits = 0;
 328         }
 329 out:
 330         local_irq_restore(flags);
 331         put_cpu();
 332 }
 333
 334 static int profile_dead_cpu(unsigned int cpu)
 335 {
 336         struct page *page;
 337         int i;
 338
 339         if (cpumask_available(prof_cpu_mask))
 340                 cpumask_clear_cpu(cpu, prof_cpu_mask);
 341
 342         for (i = 0; i < 2; i++) {
 343                 if (per_cpu(cpu_profile_hits, cpu)[i]) {
 344                         page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]);
 345                         per_cpu(cpu_profile_hits, cpu)[i] = NULL;
 346                         __free_page(page);
 347                 }
 348         }
 349         return 0;
 350 }
 351
 352 static int profile_prepare_cpu(unsigned int cpu)
 353 {
 354         int i, node = cpu_to_mem(cpu);
 355         struct page *page;
 356
 357         per_cpu(cpu_profile_flip, cpu) = 0;
 358
 359         for (i = 0; i < 2; i++) {
 360                 if (per_cpu(cpu_profile_hits, cpu)[i])
 361                         continue;
 362
 363                 page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
 364                 if (!page) {
 365                         profile_dead_cpu(cpu);
 366                         return -ENOMEM;
 367                 }
 368                 per_cpu(cpu_profile_hits, cpu)[i] = page_address(page);
 369
 370         }
 371         return 0;
 372 }
 373
 374 static int profile_online_cpu(unsigned int cpu)
 375 {
 376         if (cpumask_available(prof_cpu_mask))
 377                 cpumask_set_cpu(cpu, prof_cpu_mask);
 378
 379         return 0;
 380 }
 381
 382 #else /* !CONFIG_SMP */
 383 #define profile_flip_buffers()          do { } while (0)
 384 #define profile_discard_flip_buffers()  do { } while (0)
 385
 386 static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
 387 {
 388         unsigned long pc;
 389         pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
 390         atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
 391 }
 392 #endif /* !CONFIG_SMP */
 393
 394 void profile_hits(int type, void *__pc, unsigned int nr_hits)
 395 {
 396         if (prof_on != type || !prof_buffer)
 397                 return;
 398         do_profile_hits(type, __pc, nr_hits);
 399 }
 400 EXPORT_SYMBOL_GPL(profile_hits);
 401
 402 void profile_tick(int type)
 403 {
 404         struct pt_regs *regs = get_irq_regs();
 405
 406         if (!user_mode(regs) && cpumask_available(prof_cpu_mask) &&
 407             cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))
 408                 profile_hit(type, (void *)profile_pc(regs));
 409 }
 410
 411 #ifdef CONFIG_PROC_FS
 412 #include <linux/proc_fs.h>
 413 #include <linux/seq_file.h>
 414 #include <linux/uaccess.h>
 415
 416 static int prof_cpu_mask_proc_show(struct seq_file *m, void *v)
 417 {
 418         seq_printf(m, "%*pb\n", cpumask_pr_args(prof_cpu_mask));
 419         return 0;
 420 }
 421
 422 static int prof_cpu_mask_proc_open(struct inode *inode, struct file *file)
 423 {
 424         return single_open(file, prof_cpu_mask_proc_show, NULL);
 425 }
 426
 427 static ssize_t prof_cpu_mask_proc_write(struct file *file,
 428         const char __user *buffer, size_t count, loff_t *pos)
 429 {
 430         cpumask_var_t new_value;
 431         int err;
 432
 433         if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
 434                 return -ENOMEM;
 435
 436         err = cpumask_parse_user(buffer, count, new_value);
 437         if (!err) {
 438                 cpumask_copy(prof_cpu_mask, new_value);
 439                 err = count;
 440         }
 441         free_cpumask_var(new_value);
 442         return err;
 443 }
 444
 445 static const struct proc_ops prof_cpu_mask_proc_ops = {
 446         .proc_open      = prof_cpu_mask_proc_open,
 447         .proc_read      = seq_read,
 448         .proc_lseek     = seq_lseek,
 449         .proc_release   = single_release,
 450         .proc_write     = prof_cpu_mask_proc_write,
 451 };
 452
 453 void create_prof_cpu_mask(void)
 454 {
 455         /* create /proc/irq/prof_cpu_mask */
 456         proc_create("irq/prof_cpu_mask", 0600, NULL, &prof_cpu_mask_proc_ops);
 457 }
 458
 459 /*
 460  * This function accesses profiling information. The returned data is
 461  * binary: the sampling step and the actual contents of the profile
 462  * buffer. Use of the program readprofile is recommended in order to
 463  * get meaningful info out of these data.
 464  */
 465 static ssize_t
 466 read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 467 {
 468         unsigned long p = *ppos;
 469         ssize_t read;
 470         char *pnt;
 471         unsigned int sample_step = 1 << prof_shift;
 472
 473         profile_flip_buffers();
 474         if (p >= (prof_len+1)*sizeof(unsigned int))
 475                 return 0;
 476         if (count > (prof_len+1)*sizeof(unsigned int) - p)
 477                 count = (prof_len+1)*sizeof(unsigned int) - p;
 478         read = 0;
 479
 480         while (p < sizeof(unsigned int) && count > 0) {
 481                 if (put_user(*((char *)(&sample_step)+p), buf))
 482                         return -EFAULT;
 483                 buf++; p++; count--; read++;
 484         }
 485         pnt = (char *)prof_buffer + p - sizeof(atomic_t);
 486         if (copy_to_user(buf, (void *)pnt, count))
 487                 return -EFAULT;
 488         read += count;
 489         *ppos += read;
 490         return read;
 491 }
 492
 493 /*
 494  * Writing to /proc/profile resets the counters
 495  *
 496  * Writing a 'profiling multiplier' value into it also re-sets the profiling
 497  * interrupt frequency, on architectures that support this.
 498  */
 499 static ssize_t write_profile(struct file *file, const char __user *buf,
 500                              size_t count, loff_t *ppos)
 501 {
 502 #ifdef CONFIG_SMP
 503         extern int setup_profiling_timer(unsigned int multiplier);
 504
 505         if (count == sizeof(int)) {
 506                 unsigned int multiplier;
 507
 508                 if (copy_from_user(&multiplier, buf, sizeof(int)))
 509                         return -EFAULT;
 510
 511                 if (setup_profiling_timer(multiplier))
 512                         return -EINVAL;
 513         }
 514 #endif
 515         profile_discard_flip_buffers();
 516         memset(prof_buffer, 0, prof_len * sizeof(atomic_t));
 517         return count;
 518 }
 519
 520 static const struct proc_ops profile_proc_ops = {
 521         .proc_read      = read_profile,
 522         .proc_write     = write_profile,
 523         .proc_lseek     = default_llseek,
 524 };
 525
 526 int __ref create_proc_profile(void)
 527 {
 528         struct proc_dir_entry *entry;
 529 #ifdef CONFIG_SMP
 530         enum cpuhp_state online_state;
 531 #endif
 532
 533         int err = 0;
 534
 535         if (!prof_on)
 536                 return 0;
 537 #ifdef CONFIG_SMP
 538         err = cpuhp_setup_state(CPUHP_PROFILE_PREPARE, "PROFILE_PREPARE",
 539                                 profile_prepare_cpu, profile_dead_cpu);
 540         if (err)
 541                 return err;
 542
 543         err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "AP_PROFILE_ONLINE",
 544                                 profile_online_cpu, NULL);
 545         if (err < 0)
 546                 goto err_state_prep;
 547         online_state = err;
 548         err = 0;
 549 #endif
 550         entry = proc_create("profile", S_IWUSR | S_IRUGO,
 551                             NULL, &profile_proc_ops);
 552         if (!entry)
 553                 goto err_state_onl;
 554         proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));
 555
 556         return err;
 557 err_state_onl:
 558 #ifdef CONFIG_SMP
 559         cpuhp_remove_state(online_state);
 560 err_state_prep:
 561         cpuhp_remove_state(CPUHP_PROFILE_PREPARE);
 562 #endif
 563         return err;
 564 }
 565 subsys_initcall(create_proc_profile);
 566 #endif /* CONFIG_PROC_FS */