kernel/events/core.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Performance events core code:
   4  *
   5  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
   6  *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
   7  *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
   8  *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
   9  */
  10
  11 #include <linux/fs.h>
  12 #include <linux/mm.h>
  13 #include <linux/cpu.h>
  14 #include <linux/smp.h>
  15 #include <linux/idr.h>
  16 #include <linux/file.h>
  17 #include <linux/poll.h>
  18 #include <linux/slab.h>
  19 #include <linux/hash.h>
  20 #include <linux/tick.h>
  21 #include <linux/sysfs.h>
  22 #include <linux/dcache.h>
  23 #include <linux/percpu.h>
  24 #include <linux/ptrace.h>
  25 #include <linux/reboot.h>
  26 #include <linux/vmstat.h>
  27 #include <linux/device.h>
  28 #include <linux/export.h>
  29 #include <linux/vmalloc.h>
  30 #include <linux/hardirq.h>
  31 #include <linux/rculist.h>
  32 #include <linux/uaccess.h>
  33 #include <linux/syscalls.h>
  34 #include <linux/anon_inodes.h>
  35 #include <linux/kernel_stat.h>
  36 #include <linux/cgroup.h>
  37 #include <linux/perf_event.h>
  38 #include <linux/trace_events.h>
  39 #include <linux/hw_breakpoint.h>
  40 #include <linux/mm_types.h>
  41 #include <linux/module.h>
  42 #include <linux/mman.h>
  43 #include <linux/compat.h>
  44 #include <linux/bpf.h>
  45 #include <linux/filter.h>
  46 #include <linux/namei.h>
  47 #include <linux/parser.h>
  48 #include <linux/sched/clock.h>
  49 #include <linux/sched/mm.h>
  50 #include <linux/proc_ns.h>
  51 #include <linux/mount.h>
  52 #include <linux/min_heap.h>
  53
  54 #include "internal.h"
  55
  56 #include <asm/irq_regs.h>
  57
  58 typedef int (*remote_function_f)(void *);
  59
  60 struct remote_function_call {
  61         struct task_struct      *p;
  62         remote_function_f       func;
  63         void                    *info;
  64         int                     ret;
  65 };
  66
  67 static void remote_function(void *data)
  68 {
  69         struct remote_function_call *tfc = data;
  70         struct task_struct *p = tfc->p;
  71
  72         if (p) {
  73                 /* -EAGAIN */
  74                 if (task_cpu(p) != smp_processor_id())
  75                         return;
  76
  77                 /*
  78                  * Now that we're on right CPU with IRQs disabled, we can test
  79                  * if we hit the right task without races.
  80                  */
  81
  82                 tfc->ret = -ESRCH; /* No such (running) process */
  83                 if (p != current)
  84                         return;
  85         }
  86
  87         tfc->ret = tfc->func(tfc->info);
  88 }
  89
  90 /**
  91  * task_function_call - call a function on the cpu on which a task runs
  92  * @p:          the task to evaluate
  93  * @func:       the function to be called
  94  * @info:       the function call argument
  95  *
  96  * Calls the function @func when the task is currently running. This might
  97  * be on the current CPU, which just calls the function directly
  98  *
  99  * returns: @func return value, or
 100  *          -ESRCH  - when the process isn't running
 101  *          -EAGAIN - when the process moved away
 102  */
 103 static int
 104 task_function_call(struct task_struct *p, remote_function_f func, void *info)
 105 {
 106         struct remote_function_call data = {
 107                 .p      = p,
 108                 .func   = func,
 109                 .info   = info,
 110                 .ret    = -EAGAIN,
 111         };
 112         int ret;
 113
 114         do {
 115                 ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1);
 116                 if (!ret)
 117                         ret = data.ret;
 118         } while (ret == -EAGAIN);
 119
 120         return ret;
 121 }
 122
 123 /**
 124  * cpu_function_call - call a function on the cpu
 125  * @func:       the function to be called
 126  * @info:       the function call argument
 127  *
 128  * Calls the function @func on the remote cpu.
 129  *
 130  * returns: @func return value or -ENXIO when the cpu is offline
 131  */
 132 static int cpu_function_call(int cpu, remote_function_f func, void *info)
 133 {
 134         struct remote_function_call data = {
 135                 .p      = NULL,
 136                 .func   = func,
 137                 .info   = info,
 138                 .ret    = -ENXIO, /* No such CPU */
 139         };
 140
 141         smp_call_function_single(cpu, remote_function, &data, 1);
 142
 143         return data.ret;
 144 }
 145
 146 static inline struct perf_cpu_context *
 147 __get_cpu_context(struct perf_event_context *ctx)
 148 {
 149         return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
 150 }
 151
 152 static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
 153                           struct perf_event_context *ctx)
 154 {
 155         raw_spin_lock(&cpuctx->ctx.lock);
 156         if (ctx)
 157                 raw_spin_lock(&ctx->lock);
 158 }
 159
 160 static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
 161                             struct perf_event_context *ctx)
 162 {
 163         if (ctx)
 164                 raw_spin_unlock(&ctx->lock);
 165         raw_spin_unlock(&cpuctx->ctx.lock);
 166 }
 167
 168 #define TASK_TOMBSTONE ((void *)-1L)
 169
 170 static bool is_kernel_event(struct perf_event *event)
 171 {
 172         return READ_ONCE(event->owner) == TASK_TOMBSTONE;
 173 }
 174
 175 /*
 176  * On task ctx scheduling...
 177  *
 178  * When !ctx->nr_events a task context will not be scheduled. This means
 179  * we can disable the scheduler hooks (for performance) without leaving
 180  * pending task ctx state.
 181  *
 182  * This however results in two special cases:
 183  *
 184  *  - removing the last event from a task ctx; this is relatively straight
 185  *    forward and is done in __perf_remove_from_context.
 186  *
 187  *  - adding the first event to a task ctx; this is tricky because we cannot
 188  *    rely on ctx->is_active and therefore cannot use event_function_call().
 189  *    See perf_install_in_context().
 190  *
 191  * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
 192  */
 193
 194 typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
 195                         struct perf_event_context *, void *);
 196
 197 struct event_function_struct {
 198         struct perf_event *event;
 199         event_f func;
 200         void *data;
 201 };
 202
 203 static int event_function(void *info)
 204 {
 205         struct event_function_struct *efs = info;
 206         struct perf_event *event = efs->event;
 207         struct perf_event_context *ctx = event->ctx;
 208         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 209         struct perf_event_context *task_ctx = cpuctx->task_ctx;
 210         int ret = 0;
 211
 212         lockdep_assert_irqs_disabled();
 213
 214         perf_ctx_lock(cpuctx, task_ctx);
 215         /*
 216          * Since we do the IPI call without holding ctx->lock things can have
 217          * changed, double check we hit the task we set out to hit.
 218          */
 219         if (ctx->task) {
 220                 if (ctx->task != current) {
 221                         ret = -ESRCH;
 222                         goto unlock;
 223                 }
 224
 225                 /*
 226                  * We only use event_function_call() on established contexts,
 227                  * and event_function() is only ever called when active (or
 228                  * rather, we'll have bailed in task_function_call() or the
 229                  * above ctx->task != current test), therefore we must have
 230                  * ctx->is_active here.
 231                  */
 232                 WARN_ON_ONCE(!ctx->is_active);
 233                 /*
 234                  * And since we have ctx->is_active, cpuctx->task_ctx must
 235                  * match.
 236                  */
 237                 WARN_ON_ONCE(task_ctx != ctx);
 238         } else {
 239                 WARN_ON_ONCE(&cpuctx->ctx != ctx);
 240         }
 241
 242         efs->func(event, cpuctx, ctx, efs->data);
 243 unlock:
 244         perf_ctx_unlock(cpuctx, task_ctx);
 245
 246         return ret;
 247 }
 248
 249 static void event_function_call(struct perf_event *event, event_f func, void *data)
 250 {
 251         struct perf_event_context *ctx = event->ctx;
 252         struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
 253         struct event_function_struct efs = {
 254                 .event = event,
 255                 .func = func,
 256                 .data = data,
 257         };
 258
 259         if (!event->parent) {
 260                 /*
 261                  * If this is a !child event, we must hold ctx::mutex to
 262                  * stabilize the the event->ctx relation. See
 263                  * perf_event_ctx_lock().
 264                  */
 265                 lockdep_assert_held(&ctx->mutex);
 266         }
 267
 268         if (!task) {
 269                 cpu_function_call(event->cpu, event_function, &efs);
 270                 return;
 271         }
 272
 273         if (task == TASK_TOMBSTONE)
 274                 return;
 275
 276 again:
 277         if (!task_function_call(task, event_function, &efs))
 278                 return;
 279
 280         raw_spin_lock_irq(&ctx->lock);
 281         /*
 282          * Reload the task pointer, it might have been changed by
 283          * a concurrent perf_event_context_sched_out().
 284          */
 285         task = ctx->task;
 286         if (task == TASK_TOMBSTONE) {
 287                 raw_spin_unlock_irq(&ctx->lock);
 288                 return;
 289         }
 290         if (ctx->is_active) {
 291                 raw_spin_unlock_irq(&ctx->lock);
 292                 goto again;
 293         }
 294         func(event, NULL, ctx, data);
 295         raw_spin_unlock_irq(&ctx->lock);
 296 }
 297
 298 /*
 299  * Similar to event_function_call() + event_function(), but hard assumes IRQs
 300  * are already disabled and we're on the right CPU.
 301  */
 302 static void event_function_local(struct perf_event *event, event_f func, void *data)
 303 {
 304         struct perf_event_context *ctx = event->ctx;
 305         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 306         struct task_struct *task = READ_ONCE(ctx->task);
 307         struct perf_event_context *task_ctx = NULL;
 308
 309         lockdep_assert_irqs_disabled();
 310
 311         if (task) {
 312                 if (task == TASK_TOMBSTONE)
 313                         return;
 314
 315                 task_ctx = ctx;
 316         }
 317
 318         perf_ctx_lock(cpuctx, task_ctx);
 319
 320         task = ctx->task;
 321         if (task == TASK_TOMBSTONE)
 322                 goto unlock;
 323
 324         if (task) {
 325                 /*
 326                  * We must be either inactive or active and the right task,
 327                  * otherwise we're screwed, since we cannot IPI to somewhere
 328                  * else.
 329                  */
 330                 if (ctx->is_active) {
 331                         if (WARN_ON_ONCE(task != current))
 332                                 goto unlock;
 333
 334                         if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
 335                                 goto unlock;
 336                 }
 337         } else {
 338                 WARN_ON_ONCE(&cpuctx->ctx != ctx);
 339         }
 340
 341         func(event, cpuctx, ctx, data);
 342 unlock:
 343         perf_ctx_unlock(cpuctx, task_ctx);
 344 }
 345
 346 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
 347                        PERF_FLAG_FD_OUTPUT  |\
 348                        PERF_FLAG_PID_CGROUP |\
 349                        PERF_FLAG_FD_CLOEXEC)
 350
 351 /*
 352  * branch priv levels that need permission checks
 353  */
 354 #define PERF_SAMPLE_BRANCH_PERM_PLM \
 355         (PERF_SAMPLE_BRANCH_KERNEL |\
 356          PERF_SAMPLE_BRANCH_HV)
 357
 358 enum event_type_t {
 359         EVENT_FLEXIBLE = 0x1,
 360         EVENT_PINNED = 0x2,
 361         EVENT_TIME = 0x4,
 362         /* see ctx_resched() for details */
 363         EVENT_CPU = 0x8,
 364         EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
 365 };
 366
 367 /*
 368  * perf_sched_events : >0 events exist
 369  * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
 370  */
 371
 372 static void perf_sched_delayed(struct work_struct *work);
 373 DEFINE_STATIC_KEY_FALSE(perf_sched_events);
 374 static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
 375 static DEFINE_MUTEX(perf_sched_mutex);
 376 static atomic_t perf_sched_count;
 377
 378 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
 379 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 380 static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
 381
 382 static atomic_t nr_mmap_events __read_mostly;
 383 static atomic_t nr_comm_events __read_mostly;
 384 static atomic_t nr_namespaces_events __read_mostly;
 385 static atomic_t nr_task_events __read_mostly;
 386 static atomic_t nr_freq_events __read_mostly;
 387 static atomic_t nr_switch_events __read_mostly;
 388 static atomic_t nr_ksymbol_events __read_mostly;
 389 static atomic_t nr_bpf_events __read_mostly;
 390 static atomic_t nr_cgroup_events __read_mostly;
 391
 392 static LIST_HEAD(pmus);
 393 static DEFINE_MUTEX(pmus_lock);
 394 static struct srcu_struct pmus_srcu;
 395 static cpumask_var_t perf_online_mask;
 396
 397 /*
 398  * perf event paranoia level:
 399  *  -1 - not paranoid at all
 400  *   0 - disallow raw tracepoint access for unpriv
 401  *   1 - disallow cpu events for unpriv
 402  *   2 - disallow kernel profiling for unpriv
 403  */
 404 int sysctl_perf_event_paranoid __read_mostly = 2;
 405
 406 /* Minimum for 512 kiB + 1 user control page */
 407 int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
 408
 409 /*
 410  * max perf event sample rate
 411  */
 412 #define DEFAULT_MAX_SAMPLE_RATE         100000
 413 #define DEFAULT_SAMPLE_PERIOD_NS        (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
 414 #define DEFAULT_CPU_TIME_MAX_PERCENT    25
 415
 416 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
 417
 418 static int max_samples_per_tick __read_mostly   = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
 419 static int perf_sample_period_ns __read_mostly  = DEFAULT_SAMPLE_PERIOD_NS;
 420
 421 static int perf_sample_allowed_ns __read_mostly =
 422         DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
 423
 424 static void update_perf_cpu_limits(void)
 425 {
 426         u64 tmp = perf_sample_period_ns;
 427
 428         tmp *= sysctl_perf_cpu_time_max_percent;
 429         tmp = div_u64(tmp, 100);
 430         if (!tmp)
 431                 tmp = 1;
 432
 433         WRITE_ONCE(perf_sample_allowed_ns, tmp);
 434 }
 435
 436 static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
 437
 438 int perf_proc_update_handler(struct ctl_table *table, int write,
 439                 void __user *buffer, size_t *lenp,
 440                 loff_t *ppos)
 441 {
 442         int ret;
 443         int perf_cpu = sysctl_perf_cpu_time_max_percent;
 444         /*
 445          * If throttling is disabled don't allow the write:
 446          */
 447         if (write && (perf_cpu == 100 || perf_cpu == 0))
 448                 return -EINVAL;
 449
 450         ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 451         if (ret || !write)
 452                 return ret;
 453
 454         max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
 455         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 456         update_perf_cpu_limits();
 457
 458         return 0;
 459 }
 460
 461 int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
 462
 463 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 464                                 void __user *buffer, size_t *lenp,
 465                                 loff_t *ppos)
 466 {
 467         int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 468
 469         if (ret || !write)
 470                 return ret;
 471
 472         if (sysctl_perf_cpu_time_max_percent == 100 ||
 473             sysctl_perf_cpu_time_max_percent == 0) {
 474                 printk(KERN_WARNING
 475                        "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
 476                 WRITE_ONCE(perf_sample_allowed_ns, 0);
 477         } else {
 478                 update_perf_cpu_limits();
 479         }
 480
 481         return 0;
 482 }
 483
 484 /*
 485  * perf samples are done in some very critical code paths (NMIs).
 486  * If they take too much CPU time, the system can lock up and not
 487  * get any real work done.  This will drop the sample rate when
 488  * we detect that events are taking too long.
 489  */
 490 #define NR_ACCUMULATED_SAMPLES 128
 491 static DEFINE_PER_CPU(u64, running_sample_length);
 492
 493 static u64 __report_avg;
 494 static u64 __report_allowed;
 495
 496 static void perf_duration_warn(struct irq_work *w)
 497 {
 498         printk_ratelimited(KERN_INFO
 499                 "perf: interrupt took too long (%lld > %lld), lowering "
 500                 "kernel.perf_event_max_sample_rate to %d\n",
 501                 __report_avg, __report_allowed,
 502                 sysctl_perf_event_sample_rate);
 503 }
 504
 505 static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
 506
 507 void perf_sample_event_took(u64 sample_len_ns)
 508 {
 509         u64 max_len = READ_ONCE(perf_sample_allowed_ns);
 510         u64 running_len;
 511         u64 avg_len;
 512         u32 max;
 513
 514         if (max_len == 0)
 515                 return;
 516
 517         /* Decay the counter by 1 average sample. */
 518         running_len = __this_cpu_read(running_sample_length);
 519         running_len -= running_len/NR_ACCUMULATED_SAMPLES;
 520         running_len += sample_len_ns;
 521         __this_cpu_write(running_sample_length, running_len);
 522
 523         /*
 524          * Note: this will be biased artifically low until we have
 525          * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
 526          * from having to maintain a count.
 527          */
 528         avg_len = running_len/NR_ACCUMULATED_SAMPLES;
 529         if (avg_len <= max_len)
 530                 return;
 531
 532         __report_avg = avg_len;
 533         __report_allowed = max_len;
 534
 535         /*
 536          * Compute a throttle threshold 25% below the current duration.
 537          */
 538         avg_len += avg_len / 4;
 539         max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
 540         if (avg_len < max)
 541                 max /= (u32)avg_len;
 542         else
 543                 max = 1;
 544
 545         WRITE_ONCE(perf_sample_allowed_ns, avg_len);
 546         WRITE_ONCE(max_samples_per_tick, max);
 547
 548         sysctl_perf_event_sample_rate = max * HZ;
 549         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 550
 551         if (!irq_work_queue(&perf_duration_work)) {
 552                 early_printk("perf: interrupt took too long (%lld > %lld), lowering "
 553                              "kernel.perf_event_max_sample_rate to %d\n",
 554                              __report_avg, __report_allowed,
 555                              sysctl_perf_event_sample_rate);
 556         }
 557 }
 558
 559 static atomic64_t perf_event_id;
 560
 561 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
 562                               enum event_type_t event_type);
 563
 564 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 565                              enum event_type_t event_type,
 566                              struct task_struct *task);
 567
 568 static void update_context_time(struct perf_event_context *ctx);
 569 static u64 perf_event_time(struct perf_event *event);
 570
 571 void __weak perf_event_print_debug(void)        { }
 572
 573 extern __weak const char *perf_pmu_name(void)
 574 {
 575         return "pmu";
 576 }
 577
 578 static inline u64 perf_clock(void)
 579 {
 580         return local_clock();
 581 }
 582
 583 static inline u64 perf_event_clock(struct perf_event *event)
 584 {
 585         return event->clock();
 586 }
 587
 588 /*
 589  * State based event timekeeping...
 590  *
 591  * The basic idea is to use event->state to determine which (if any) time
 592  * fields to increment with the current delta. This means we only need to
 593  * update timestamps when we change state or when they are explicitly requested
 594  * (read).
 595  *
 596  * Event groups make things a little more complicated, but not terribly so. The
 597  * rules for a group are that if the group leader is OFF the entire group is
 598  * OFF, irrespecive of what the group member states are. This results in
 599  * __perf_effective_state().
 600  *
 601  * A futher ramification is that when a group leader flips between OFF and
 602  * !OFF, we need to update all group member times.
 603  *
 604  *
 605  * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
 606  * need to make sure the relevant context time is updated before we try and
 607  * update our timestamps.
 608  */
 609
 610 static __always_inline enum perf_event_state
 611 __perf_effective_state(struct perf_event *event)
 612 {
 613         struct perf_event *leader = event->group_leader;
 614
 615         if (leader->state <= PERF_EVENT_STATE_OFF)
 616                 return leader->state;
 617
 618         return event->state;
 619 }
 620
 621 static __always_inline void
 622 __perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
 623 {
 624         enum perf_event_state state = __perf_effective_state(event);
 625         u64 delta = now - event->tstamp;
 626
 627         *enabled = event->total_time_enabled;
 628         if (state >= PERF_EVENT_STATE_INACTIVE)
 629                 *enabled += delta;
 630
 631         *running = event->total_time_running;
 632         if (state >= PERF_EVENT_STATE_ACTIVE)
 633                 *running += delta;
 634 }
 635
 636 static void perf_event_update_time(struct perf_event *event)
 637 {
 638         u64 now = perf_event_time(event);
 639
 640         __perf_update_times(event, now, &event->total_time_enabled,
 641                                         &event->total_time_running);
 642         event->tstamp = now;
 643 }
 644
 645 static void perf_event_update_sibling_time(struct perf_event *leader)
 646 {
 647         struct perf_event *sibling;
 648
 649         for_each_sibling_event(sibling, leader)
 650                 perf_event_update_time(sibling);
 651 }
 652
 653 static void
 654 perf_event_set_state(struct perf_event *event, enum perf_event_state state)
 655 {
 656         if (event->state == state)
 657                 return;
 658
 659         perf_event_update_time(event);
 660         /*
 661          * If a group leader gets enabled/disabled all its siblings
 662          * are affected too.
 663          */
 664         if ((event->state < 0) ^ (state < 0))
 665                 perf_event_update_sibling_time(event);
 666
 667         WRITE_ONCE(event->state, state);
 668 }
 669
 670 #ifdef CONFIG_CGROUP_PERF
 671
 672 static inline bool
 673 perf_cgroup_match(struct perf_event *event)
 674 {
 675         struct perf_event_context *ctx = event->ctx;
 676         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 677
 678         /* @event doesn't care about cgroup */
 679         if (!event->cgrp)
 680                 return true;
 681
 682         /* wants specific cgroup scope but @cpuctx isn't associated with any */
 683         if (!cpuctx->cgrp)
 684                 return false;
 685
 686         /*
 687          * Cgroup scoping is recursive.  An event enabled for a cgroup is
 688          * also enabled for all its descendant cgroups.  If @cpuctx's
 689          * cgroup is a descendant of @event's (the test covers identity
 690          * case), it's a match.
 691          */
 692         return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
 693                                     event->cgrp->css.cgroup);
 694 }
 695
 696 static inline void perf_detach_cgroup(struct perf_event *event)
 697 {
 698         css_put(&event->cgrp->css);
 699         event->cgrp = NULL;
 700 }
 701
 702 static inline int is_cgroup_event(struct perf_event *event)
 703 {
 704         return event->cgrp != NULL;
 705 }
 706
 707 static inline u64 perf_cgroup_event_time(struct perf_event *event)
 708 {
 709         struct perf_cgroup_info *t;
 710
 711         t = per_cpu_ptr(event->cgrp->info, event->cpu);
 712         return t->time;
 713 }
 714
 715 static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
 716 {
 717         struct perf_cgroup_info *info;
 718         u64 now;
 719
 720         now = perf_clock();
 721
 722         info = this_cpu_ptr(cgrp->info);
 723
 724         info->time += now - info->timestamp;
 725         info->timestamp = now;
 726 }
 727
 728 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
 729 {
 730         struct perf_cgroup *cgrp = cpuctx->cgrp;
 731         struct cgroup_subsys_state *css;
 732
 733         if (cgrp) {
 734                 for (css = &cgrp->css; css; css = css->parent) {
 735                         cgrp = container_of(css, struct perf_cgroup, css);
 736                         __update_cgrp_time(cgrp);
 737                 }
 738         }
 739 }
 740
 741 static inline void update_cgrp_time_from_event(struct perf_event *event)
 742 {
 743         struct perf_cgroup *cgrp;
 744
 745         /*
 746          * ensure we access cgroup data only when needed and
 747          * when we know the cgroup is pinned (css_get)
 748          */
 749         if (!is_cgroup_event(event))
 750                 return;
 751
 752         cgrp = perf_cgroup_from_task(current, event->ctx);
 753         /*
 754          * Do not update time when cgroup is not active
 755          */
 756         if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
 757                 __update_cgrp_time(event->cgrp);
 758 }
 759
 760 static inline void
 761 perf_cgroup_set_timestamp(struct task_struct *task,
 762                           struct perf_event_context *ctx)
 763 {
 764         struct perf_cgroup *cgrp;
 765         struct perf_cgroup_info *info;
 766         struct cgroup_subsys_state *css;
 767
 768         /*
 769          * ctx->lock held by caller
 770          * ensure we do not access cgroup data
 771          * unless we have the cgroup pinned (css_get)
 772          */
 773         if (!task || !ctx->nr_cgroups)
 774                 return;
 775
 776         cgrp = perf_cgroup_from_task(task, ctx);
 777
 778         for (css = &cgrp->css; css; css = css->parent) {
 779                 cgrp = container_of(css, struct perf_cgroup, css);
 780                 info = this_cpu_ptr(cgrp->info);
 781                 info->timestamp = ctx->timestamp;
 782         }
 783 }
 784
 785 static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
 786
 787 #define PERF_CGROUP_SWOUT       0x1 /* cgroup switch out every event */
 788 #define PERF_CGROUP_SWIN        0x2 /* cgroup switch in events based on task */
 789
 790 /*
 791  * reschedule events based on the cgroup constraint of task.
 792  *
 793  * mode SWOUT : schedule out everything
 794  * mode SWIN : schedule in based on cgroup for next
 795  */
 796 static void perf_cgroup_switch(struct task_struct *task, int mode)
 797 {
 798         struct perf_cpu_context *cpuctx;
 799         struct list_head *list;
 800         unsigned long flags;
 801
 802         /*
 803          * Disable interrupts and preemption to avoid this CPU's
 804          * cgrp_cpuctx_entry to change under us.
 805          */
 806         local_irq_save(flags);
 807
 808         list = this_cpu_ptr(&cgrp_cpuctx_list);
 809         list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
 810                 WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
 811
 812                 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 813                 perf_pmu_disable(cpuctx->ctx.pmu);
 814
 815                 if (mode & PERF_CGROUP_SWOUT) {
 816                         cpu_ctx_sched_out(cpuctx, EVENT_ALL);
 817                         /*
 818                          * must not be done before ctxswout due
 819                          * to event_filter_match() in event_sched_out()
 820                          */
 821                         cpuctx->cgrp = NULL;
 822                 }
 823
 824                 if (mode & PERF_CGROUP_SWIN) {
 825                         WARN_ON_ONCE(cpuctx->cgrp);
 826                         /*
 827                          * set cgrp before ctxsw in to allow
 828                          * event_filter_match() to not have to pass
 829                          * task around
 830                          * we pass the cpuctx->ctx to perf_cgroup_from_task()
 831                          * because cgorup events are only per-cpu
 832                          */
 833                         cpuctx->cgrp = perf_cgroup_from_task(task,
 834                                                              &cpuctx->ctx);
 835                         cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
 836                 }
 837                 perf_pmu_enable(cpuctx->ctx.pmu);
 838                 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 839         }
 840
 841         local_irq_restore(flags);
 842 }
 843
 844 static inline void perf_cgroup_sched_out(struct task_struct *task,
 845                                          struct task_struct *next)
 846 {
 847         struct perf_cgroup *cgrp1;
 848         struct perf_cgroup *cgrp2 = NULL;
 849
 850         rcu_read_lock();
 851         /*
 852          * we come here when we know perf_cgroup_events > 0
 853          * we do not need to pass the ctx here because we know
 854          * we are holding the rcu lock
 855          */
 856         cgrp1 = perf_cgroup_from_task(task, NULL);
 857         cgrp2 = perf_cgroup_from_task(next, NULL);
 858
 859         /*
 860          * only schedule out current cgroup events if we know
 861          * that we are switching to a different cgroup. Otherwise,
 862          * do no touch the cgroup events.
 863          */
 864         if (cgrp1 != cgrp2)
 865                 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
 866
 867         rcu_read_unlock();
 868 }
 869
 870 static inline void perf_cgroup_sched_in(struct task_struct *prev,
 871                                         struct task_struct *task)
 872 {
 873         struct perf_cgroup *cgrp1;
 874         struct perf_cgroup *cgrp2 = NULL;
 875
 876         rcu_read_lock();
 877         /*
 878          * we come here when we know perf_cgroup_events > 0
 879          * we do not need to pass the ctx here because we know
 880          * we are holding the rcu lock
 881          */
 882         cgrp1 = perf_cgroup_from_task(task, NULL);
 883         cgrp2 = perf_cgroup_from_task(prev, NULL);
 884
 885         /*
 886          * only need to schedule in cgroup events if we are changing
 887          * cgroup during ctxsw. Cgroup events were not scheduled
 888          * out of ctxsw out if that was not the case.
 889          */
 890         if (cgrp1 != cgrp2)
 891                 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
 892
 893         rcu_read_unlock();
 894 }
 895
 896 static int perf_cgroup_ensure_storage(struct perf_event *event,
 897                                 struct cgroup_subsys_state *css)
 898 {
 899         struct perf_cpu_context *cpuctx;
 900         struct perf_event **storage;
 901         int cpu, heap_size, ret = 0;
 902
 903         /*
 904          * Allow storage to have sufficent space for an iterator for each
 905          * possibly nested cgroup plus an iterator for events with no cgroup.
 906          */
 907         for (heap_size = 1; css; css = css->parent)
 908                 heap_size++;
 909
 910         for_each_possible_cpu(cpu) {
 911                 cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu);
 912                 if (heap_size <= cpuctx->heap_size)
 913                         continue;
 914
 915                 storage = kmalloc_node(heap_size * sizeof(struct perf_event *),
 916                                        GFP_KERNEL, cpu_to_node(cpu));
 917                 if (!storage) {
 918                         ret = -ENOMEM;
 919                         break;
 920                 }
 921
 922                 raw_spin_lock_irq(&cpuctx->ctx.lock);
 923                 if (cpuctx->heap_size < heap_size) {
 924                         swap(cpuctx->heap, storage);
 925                         if (storage == cpuctx->heap_default)
 926                                 storage = NULL;
 927                         cpuctx->heap_size = heap_size;
 928                 }
 929                 raw_spin_unlock_irq(&cpuctx->ctx.lock);
 930
 931                 kfree(storage);
 932         }
 933
 934         return ret;
 935 }
 936
 937 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 938                                       struct perf_event_attr *attr,
 939                                       struct perf_event *group_leader)
 940 {
 941         struct perf_cgroup *cgrp;
 942         struct cgroup_subsys_state *css;
 943         struct fd f = fdget(fd);
 944         int ret = 0;
 945
 946         if (!f.file)
 947                 return -EBADF;
 948
 949         css = css_tryget_online_from_dir(f.file->f_path.dentry,
 950                                          &perf_event_cgrp_subsys);
 951         if (IS_ERR(css)) {
 952                 ret = PTR_ERR(css);
 953                 goto out;
 954         }
 955
 956         ret = perf_cgroup_ensure_storage(event, css);
 957         if (ret)
 958                 goto out;
 959
 960         cgrp = container_of(css, struct perf_cgroup, css);
 961         event->cgrp = cgrp;
 962
 963         /*
 964          * all events in a group must monitor
 965          * the same cgroup because a task belongs
 966          * to only one perf cgroup at a time
 967          */
 968         if (group_leader && group_leader->cgrp != cgrp) {
 969                 perf_detach_cgroup(event);
 970                 ret = -EINVAL;
 971         }
 972 out:
 973         fdput(f);
 974         return ret;
 975 }
 976
 977 static inline void
 978 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
 979 {
 980         struct perf_cgroup_info *t;
 981         t = per_cpu_ptr(event->cgrp->info, event->cpu);
 982         event->shadow_ctx_time = now - t->timestamp;
 983 }
 984
 985 /*
 986  * Update cpuctx->cgrp so that it is set when first cgroup event is added and
 987  * cleared when last cgroup event is removed.
 988  */
 989 static inline void
 990 list_update_cgroup_event(struct perf_event *event,
 991                          struct perf_event_context *ctx, bool add)
 992 {
 993         struct perf_cpu_context *cpuctx;
 994         struct list_head *cpuctx_entry;
 995
 996         if (!is_cgroup_event(event))
 997                 return;
 998
 999         /*
1000          * Because cgroup events are always per-cpu events,
1001          * @ctx == &cpuctx->ctx.
1002          */
1003         cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
1004
1005         /*
1006          * Since setting cpuctx->cgrp is conditional on the current @cgrp
1007          * matching the event's cgroup, we must do this for every new event,
1008          * because if the first would mismatch, the second would not try again
1009          * and we would leave cpuctx->cgrp unset.
1010          */
1011         if (add && !cpuctx->cgrp) {
1012                 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
1013
1014                 if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
1015                         cpuctx->cgrp = cgrp;
1016         }
1017
1018         if (add && ctx->nr_cgroups++)
1019                 return;
1020         else if (!add && --ctx->nr_cgroups)
1021                 return;
1022
1023         /* no cgroup running */
1024         if (!add)
1025                 cpuctx->cgrp = NULL;
1026
1027         cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
1028         if (add)
1029                 list_add(cpuctx_entry,
1030                          per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
1031         else
1032                 list_del(cpuctx_entry);
1033 }
1034
1035 #else /* !CONFIG_CGROUP_PERF */
1036
1037 static inline bool
1038 perf_cgroup_match(struct perf_event *event)
1039 {
1040         return true;
1041 }
1042
1043 static inline void perf_detach_cgroup(struct perf_event *event)
1044 {}
1045
1046 static inline int is_cgroup_event(struct perf_event *event)
1047 {
1048         return 0;
1049 }
1050
1051 static inline void update_cgrp_time_from_event(struct perf_event *event)
1052 {
1053 }
1054
1055 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
1056 {
1057 }
1058
1059 static inline void perf_cgroup_sched_out(struct task_struct *task,
1060                                          struct task_struct *next)
1061 {
1062 }
1063
1064 static inline void perf_cgroup_sched_in(struct task_struct *prev,
1065                                         struct task_struct *task)
1066 {
1067 }
1068
1069 static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
1070                                       struct perf_event_attr *attr,
1071                                       struct perf_event *group_leader)
1072 {
1073         return -EINVAL;
1074 }
1075
1076 static inline void
1077 perf_cgroup_set_timestamp(struct task_struct *task,
1078                           struct perf_event_context *ctx)
1079 {
1080 }
1081
1082 static inline void
1083 perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
1084 {
1085 }
1086
1087 static inline void
1088 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
1089 {
1090 }
1091
1092 static inline u64 perf_cgroup_event_time(struct perf_event *event)
1093 {
1094         return 0;
1095 }
1096
1097 static inline void
1098 list_update_cgroup_event(struct perf_event *event,
1099                          struct perf_event_context *ctx, bool add)
1100 {
1101 }
1102
1103 #endif
1104
1105 /*
1106  * set default to be dependent on timer tick just
1107  * like original code
1108  */
1109 #define PERF_CPU_HRTIMER (1000 / HZ)
1110 /*
1111  * function must be called with interrupts disabled
1112  */
1113 static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
1114 {
1115         struct perf_cpu_context *cpuctx;
1116         bool rotations;
1117
1118         lockdep_assert_irqs_disabled();
1119
1120         cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
1121         rotations = perf_rotate_context(cpuctx);
1122
1123         raw_spin_lock(&cpuctx->hrtimer_lock);
1124         if (rotations)
1125                 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
1126         else
1127                 cpuctx->hrtimer_active = 0;
1128         raw_spin_unlock(&cpuctx->hrtimer_lock);
1129
1130         return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
1131 }
1132
1133 static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
1134 {
1135         struct hrtimer *timer = &cpuctx->hrtimer;
1136         struct pmu *pmu = cpuctx->ctx.pmu;
1137         u64 interval;
1138
1139         /* no multiplexing needed for SW PMU */
1140         if (pmu->task_ctx_nr == perf_sw_context)
1141                 return;
1142
1143         /*
1144          * check default is sane, if not set then force to
1145          * default interval (1/tick)
1146          */
1147         interval = pmu->hrtimer_interval_ms;
1148         if (interval < 1)
1149                 interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
1150
1151         cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
1152
1153         raw_spin_lock_init(&cpuctx->hrtimer_lock);
1154         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
1155         timer->function = perf_mux_hrtimer_handler;
1156 }
1157
1158 static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
1159 {
1160         struct hrtimer *timer = &cpuctx->hrtimer;
1161         struct pmu *pmu = cpuctx->ctx.pmu;
1162         unsigned long flags;
1163
1164         /* not for SW PMU */
1165         if (pmu->task_ctx_nr == perf_sw_context)
1166                 return 0;
1167
1168         raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
1169         if (!cpuctx->hrtimer_active) {
1170                 cpuctx->hrtimer_active = 1;
1171                 hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
1172                 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
1173         }
1174         raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
1175
1176         return 0;
1177 }
1178
1179 void perf_pmu_disable(struct pmu *pmu)
1180 {
1181         int *count = this_cpu_ptr(pmu->pmu_disable_count);
1182         if (!(*count)++)
1183                 pmu->pmu_disable(pmu);
1184 }
1185
1186 void perf_pmu_enable(struct pmu *pmu)
1187 {
1188         int *count = this_cpu_ptr(pmu->pmu_disable_count);
1189         if (!--(*count))
1190                 pmu->pmu_enable(pmu);
1191 }
1192
1193 static DEFINE_PER_CPU(struct list_head, active_ctx_list);
1194
1195 /*
1196  * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
1197  * perf_event_task_tick() are fully serialized because they're strictly cpu
1198  * affine and perf_event_ctx{activate,deactivate} are called with IRQs
1199  * disabled, while perf_event_task_tick is called from IRQ context.
1200  */
1201 static void perf_event_ctx_activate(struct perf_event_context *ctx)
1202 {
1203         struct list_head *head = this_cpu_ptr(&active_ctx_list);
1204
1205         lockdep_assert_irqs_disabled();
1206
1207         WARN_ON(!list_empty(&ctx->active_ctx_list));
1208
1209         list_add(&ctx->active_ctx_list, head);
1210 }
1211
1212 static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
1213 {
1214         lockdep_assert_irqs_disabled();
1215
1216         WARN_ON(list_empty(&ctx->active_ctx_list));
1217
1218         list_del_init(&ctx->active_ctx_list);
1219 }
1220
1221 static void get_ctx(struct perf_event_context *ctx)
1222 {
1223         refcount_inc(&ctx->refcount);
1224 }
1225
1226 static void free_ctx(struct rcu_head *head)
1227 {
1228         struct perf_event_context *ctx;
1229
1230         ctx = container_of(head, struct perf_event_context, rcu_head);
1231         kfree(ctx->task_ctx_data);
1232         kfree(ctx);
1233 }
1234
1235 static void put_ctx(struct perf_event_context *ctx)
1236 {
1237         if (refcount_dec_and_test(&ctx->refcount)) {
1238                 if (ctx->parent_ctx)
1239                         put_ctx(ctx->parent_ctx);
1240                 if (ctx->task && ctx->task != TASK_TOMBSTONE)
1241                         put_task_struct(ctx->task);
1242                 call_rcu(&ctx->rcu_head, free_ctx);
1243         }
1244 }
1245
1246 /*
1247  * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
1248  * perf_pmu_migrate_context() we need some magic.
1249  *
1250  * Those places that change perf_event::ctx will hold both
1251  * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
1252  *
1253  * Lock ordering is by mutex address. There are two other sites where
1254  * perf_event_context::mutex nests and those are:
1255  *
1256  *  - perf_event_exit_task_context()    [ child , 0 ]
1257  *      perf_event_exit_event()
1258  *        put_event()                   [ parent, 1 ]
1259  *
1260  *  - perf_event_init_context()         [ parent, 0 ]
1261  *      inherit_task_group()
1262  *        inherit_group()
1263  *          inherit_event()
1264  *            perf_event_alloc()
1265  *              perf_init_event()
1266  *                perf_try_init_event() [ child , 1 ]
1267  *
1268  * While it appears there is an obvious deadlock here -- the parent and child
1269  * nesting levels are inverted between the two. This is in fact safe because
1270  * life-time rules separate them. That is an exiting task cannot fork, and a
1271  * spawning task cannot (yet) exit.
1272  *
1273  * But remember that that these are parent<->child context relations, and
1274  * migration does not affect children, therefore these two orderings should not
1275  * interact.
1276  *
1277  * The change in perf_event::ctx does not affect children (as claimed above)
1278  * because the sys_perf_event_open() case will install a new event and break
1279  * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
1280  * concerned with cpuctx and that doesn't have children.
1281  *
1282  * The places that change perf_event::ctx will issue:
1283  *
1284  *   perf_remove_from_context();
1285  *   synchronize_rcu();
1286  *   perf_install_in_context();
1287  *
1288  * to affect the change. The remove_from_context() + synchronize_rcu() should
1289  * quiesce the event, after which we can install it in the new location. This
1290  * means that only external vectors (perf_fops, prctl) can perturb the event
1291  * while in transit. Therefore all such accessors should also acquire
1292  * perf_event_context::mutex to serialize against this.
1293  *
1294  * However; because event->ctx can change while we're waiting to acquire
1295  * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
1296  * function.
1297  *
1298  * Lock order:
1299  *    exec_update_mutex
1300  *      task_struct::perf_event_mutex
1301  *        perf_event_context::mutex
1302  *          perf_event::child_mutex;
1303  *            perf_event_context::lock
1304  *          perf_event::mmap_mutex
1305  *          mmap_sem
1306  *            perf_addr_filters_head::lock
1307  *
1308  *    cpu_hotplug_lock
1309  *      pmus_lock
1310  *        cpuctx->mutex / perf_event_context::mutex
1311  */
1312 static struct perf_event_context *
1313 perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
1314 {
1315         struct perf_event_context *ctx;
1316
1317 again:
1318         rcu_read_lock();
1319         ctx = READ_ONCE(event->ctx);
1320         if (!refcount_inc_not_zero(&ctx->refcount)) {
1321                 rcu_read_unlock();
1322                 goto again;
1323         }
1324         rcu_read_unlock();
1325
1326         mutex_lock_nested(&ctx->mutex, nesting);
1327         if (event->ctx != ctx) {
1328                 mutex_unlock(&ctx->mutex);
1329                 put_ctx(ctx);
1330                 goto again;
1331         }
1332
1333         return ctx;
1334 }
1335
1336 static inline struct perf_event_context *
1337 perf_event_ctx_lock(struct perf_event *event)
1338 {
1339         return perf_event_ctx_lock_nested(event, 0);
1340 }
1341
1342 static void perf_event_ctx_unlock(struct perf_event *event,
1343                                   struct perf_event_context *ctx)
1344 {
1345         mutex_unlock(&ctx->mutex);
1346         put_ctx(ctx);
1347 }
1348
1349 /*
1350  * This must be done under the ctx->lock, such as to serialize against
1351  * context_equiv(), therefore we cannot call put_ctx() since that might end up
1352  * calling scheduler related locks and ctx->lock nests inside those.
1353  */
1354 static __must_check struct perf_event_context *
1355 unclone_ctx(struct perf_event_context *ctx)
1356 {
1357         struct perf_event_context *parent_ctx = ctx->parent_ctx;
1358
1359         lockdep_assert_held(&ctx->lock);
1360
1361         if (parent_ctx)
1362                 ctx->parent_ctx = NULL;
1363         ctx->generation++;
1364
1365         return parent_ctx;
1366 }
1367
1368 static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
1369                                 enum pid_type type)
1370 {
1371         u32 nr;
1372         /*
1373          * only top level events have the pid namespace they were created in
1374          */
1375         if (event->parent)
1376                 event = event->parent;
1377
1378         nr = __task_pid_nr_ns(p, type, event->ns);
1379         /* avoid -1 if it is idle thread or runs in another ns */
1380         if (!nr && !pid_alive(p))
1381                 nr = -1;
1382         return nr;
1383 }
1384
1385 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1386 {
1387         return perf_event_pid_type(event, p, PIDTYPE_TGID);
1388 }
1389
1390 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1391 {
1392         return perf_event_pid_type(event, p, PIDTYPE_PID);
1393 }
1394
1395 /*
1396  * If we inherit events we want to return the parent event id
1397  * to userspace.
1398  */
1399 static u64 primary_event_id(struct perf_event *event)
1400 {
1401         u64 id = event->id;
1402
1403         if (event->parent)
1404                 id = event->parent->id;
1405
1406         return id;
1407 }
1408
1409 /*
1410  * Get the perf_event_context for a task and lock it.
1411  *
1412  * This has to cope with with the fact that until it is locked,
1413  * the context could get moved to another task.
1414  */
1415 static struct perf_event_context *
1416 perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
1417 {
1418         struct perf_event_context *ctx;
1419
1420 retry:
1421         /*
1422          * One of the few rules of preemptible RCU is that one cannot do
1423          * rcu_read_unlock() while holding a scheduler (or nested) lock when
1424          * part of the read side critical section was irqs-enabled -- see
1425          * rcu_read_unlock_special().
1426          *
1427          * Since ctx->lock nests under rq->lock we must ensure the entire read
1428          * side critical section has interrupts disabled.
1429          */
1430         local_irq_save(*flags);
1431         rcu_read_lock();
1432         ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
1433         if (ctx) {
1434                 /*
1435                  * If this context is a clone of another, it might
1436                  * get swapped for another underneath us by
1437                  * perf_event_task_sched_out, though the
1438                  * rcu_read_lock() protects us from any context
1439                  * getting freed.  Lock the context and check if it
1440                  * got swapped before we could get the lock, and retry
1441                  * if so.  If we locked the right context, then it
1442                  * can't get swapped on us any more.
1443                  */
1444                 raw_spin_lock(&ctx->lock);
1445                 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
1446                         raw_spin_unlock(&ctx->lock);
1447                         rcu_read_unlock();
1448                         local_irq_restore(*flags);
1449                         goto retry;
1450                 }
1451
1452                 if (ctx->task == TASK_TOMBSTONE ||
1453                     !refcount_inc_not_zero(&ctx->refcount)) {
1454                         raw_spin_unlock(&ctx->lock);
1455                         ctx = NULL;
1456                 } else {
1457                         WARN_ON_ONCE(ctx->task != task);
1458                 }
1459         }
1460         rcu_read_unlock();
1461         if (!ctx)
1462                 local_irq_restore(*flags);
1463         return ctx;
1464 }
1465
1466 /*
1467  * Get the context for a task and increment its pin_count so it
1468  * can't get swapped to another task.  This also increments its
1469  * reference count so that the context can't get freed.
1470  */
1471 static struct perf_event_context *
1472 perf_pin_task_context(struct task_struct *task, int ctxn)
1473 {
1474         struct perf_event_context *ctx;
1475         unsigned long flags;
1476
1477         ctx = perf_lock_task_context(task, ctxn, &flags);
1478         if (ctx) {
1479                 ++ctx->pin_count;
1480                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1481         }
1482         return ctx;
1483 }
1484
1485 static void perf_unpin_context(struct perf_event_context *ctx)
1486 {
1487         unsigned long flags;
1488
1489         raw_spin_lock_irqsave(&ctx->lock, flags);
1490         --ctx->pin_count;
1491         raw_spin_unlock_irqrestore(&ctx->lock, flags);
1492 }
1493
1494 /*
1495  * Update the record of the current time in a context.
1496  */
1497 static void update_context_time(struct perf_event_context *ctx)
1498 {
1499         u64 now = perf_clock();
1500
1501         ctx->time += now - ctx->timestamp;
1502         ctx->timestamp = now;
1503 }
1504
1505 static u64 perf_event_time(struct perf_event *event)
1506 {
1507         struct perf_event_context *ctx = event->ctx;
1508
1509         if (is_cgroup_event(event))
1510                 return perf_cgroup_event_time(event);
1511
1512         return ctx ? ctx->time : 0;
1513 }
1514
1515 static enum event_type_t get_event_type(struct perf_event *event)
1516 {
1517         struct perf_event_context *ctx = event->ctx;
1518         enum event_type_t event_type;
1519
1520         lockdep_assert_held(&ctx->lock);
1521
1522         /*
1523          * It's 'group type', really, because if our group leader is
1524          * pinned, so are we.
1525          */
1526         if (event->group_leader != event)
1527                 event = event->group_leader;
1528
1529         event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
1530         if (!ctx->task)
1531                 event_type |= EVENT_CPU;
1532
1533         return event_type;
1534 }
1535
1536 /*
1537  * Helper function to initialize event group nodes.
1538  */
1539 static void init_event_group(struct perf_event *event)
1540 {
1541         RB_CLEAR_NODE(&event->group_node);
1542         event->group_index = 0;
1543 }
1544
1545 /*
1546  * Extract pinned or flexible groups from the context
1547  * based on event attrs bits.
1548  */
1549 static struct perf_event_groups *
1550 get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
1551 {
1552         if (event->attr.pinned)
1553                 return &ctx->pinned_groups;
1554         else
1555                 return &ctx->flexible_groups;
1556 }
1557
1558 /*
1559  * Helper function to initializes perf_event_group trees.
1560  */
1561 static void perf_event_groups_init(struct perf_event_groups *groups)
1562 {
1563         groups->tree = RB_ROOT;
1564         groups->index = 0;
1565 }
1566
1567 /*
1568  * Compare function for event groups;
1569  *
1570  * Implements complex key that first sorts by CPU and then by virtual index
1571  * which provides ordering when rotating groups for the same CPU.
1572  */
1573 static bool
1574 perf_event_groups_less(struct perf_event *left, struct perf_event *right)
1575 {
1576         if (left->cpu < right->cpu)
1577                 return true;
1578         if (left->cpu > right->cpu)
1579                 return false;
1580
1581 #ifdef CONFIG_CGROUP_PERF
1582         if (left->cgrp != right->cgrp) {
1583                 if (!left->cgrp || !left->cgrp->css.cgroup) {
1584                         /*
1585                          * Left has no cgroup but right does, no cgroups come
1586                          * first.
1587                          */
1588                         return true;
1589                 }
1590                 if (!right->cgrp || !right->cgrp->css.cgroup) {
1591                         /*
1592                          * Right has no cgroup but left does, no cgroups come
1593                          * first.
1594                          */
1595                         return false;
1596                 }
1597                 /* Two dissimilar cgroups, order by id. */
1598                 if (left->cgrp->css.cgroup->kn->id < right->cgrp->css.cgroup->kn->id)
1599                         return true;
1600
1601                 return false;
1602         }
1603 #endif
1604
1605         if (left->group_index < right->group_index)
1606                 return true;
1607         if (left->group_index > right->group_index)
1608                 return false;
1609
1610         return false;
1611 }
1612
1613 /*
1614  * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
1615  * key (see perf_event_groups_less). This places it last inside the CPU
1616  * subtree.
1617  */
1618 static void
1619 perf_event_groups_insert(struct perf_event_groups *groups,
1620                          struct perf_event *event)
1621 {
1622         struct perf_event *node_event;
1623         struct rb_node *parent;
1624         struct rb_node **node;
1625
1626         event->group_index = ++groups->index;
1627
1628         node = &groups->tree.rb_node;
1629         parent = *node;
1630
1631         while (*node) {
1632                 parent = *node;
1633                 node_event = container_of(*node, struct perf_event, group_node);
1634
1635                 if (perf_event_groups_less(event, node_event))
1636                         node = &parent->rb_left;
1637                 else
1638                         node = &parent->rb_right;
1639         }
1640
1641         rb_link_node(&event->group_node, parent, node);
1642         rb_insert_color(&event->group_node, &groups->tree);
1643 }
1644
1645 /*
1646  * Helper function to insert event into the pinned or flexible groups.
1647  */
1648 static void
1649 add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
1650 {
1651         struct perf_event_groups *groups;
1652
1653         groups = get_event_groups(event, ctx);
1654         perf_event_groups_insert(groups, event);
1655 }
1656
1657 /*
1658  * Delete a group from a tree.
1659  */
1660 static void
1661 perf_event_groups_delete(struct perf_event_groups *groups,
1662                          struct perf_event *event)
1663 {
1664         WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
1665                      RB_EMPTY_ROOT(&groups->tree));
1666
1667         rb_erase(&event->group_node, &groups->tree);
1668         init_event_group(event);
1669 }
1670
1671 /*
1672  * Helper function to delete event from its groups.
1673  */
1674 static void
1675 del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
1676 {
1677         struct perf_event_groups *groups;
1678
1679         groups = get_event_groups(event, ctx);
1680         perf_event_groups_delete(groups, event);
1681 }
1682
1683 /*
1684  * Get the leftmost event in the cpu/cgroup subtree.
1685  */
1686 static struct perf_event *
1687 perf_event_groups_first(struct perf_event_groups *groups, int cpu,
1688                         struct cgroup *cgrp)
1689 {
1690         struct perf_event *node_event = NULL, *match = NULL;
1691         struct rb_node *node = groups->tree.rb_node;
1692 #ifdef CONFIG_CGROUP_PERF
1693         u64 node_cgrp_id, cgrp_id = 0;
1694
1695         if (cgrp)
1696                 cgrp_id = cgrp->kn->id;
1697 #endif
1698
1699         while (node) {
1700                 node_event = container_of(node, struct perf_event, group_node);
1701
1702                 if (cpu < node_event->cpu) {
1703                         node = node->rb_left;
1704                         continue;
1705                 }
1706                 if (cpu > node_event->cpu) {
1707                         node = node->rb_right;
1708                         continue;
1709                 }
1710 #ifdef CONFIG_CGROUP_PERF
1711                 node_cgrp_id = 0;
1712                 if (node_event->cgrp && node_event->cgrp->css.cgroup)
1713                         node_cgrp_id = node_event->cgrp->css.cgroup->kn->id;
1714
1715                 if (cgrp_id < node_cgrp_id) {
1716                         node = node->rb_left;
1717                         continue;
1718                 }
1719                 if (cgrp_id > node_cgrp_id) {
1720                         node = node->rb_right;
1721                         continue;
1722                 }
1723 #endif
1724                 match = node_event;
1725                 node = node->rb_left;
1726         }
1727
1728         return match;
1729 }
1730
1731 /*
1732  * Like rb_entry_next_safe() for the @cpu subtree.
1733  */
1734 static struct perf_event *
1735 perf_event_groups_next(struct perf_event *event)
1736 {
1737         struct perf_event *next;
1738 #ifdef CONFIG_CGROUP_PERF
1739         u64 curr_cgrp_id = 0;
1740         u64 next_cgrp_id = 0;
1741 #endif
1742
1743         next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
1744         if (next == NULL || next->cpu != event->cpu)
1745                 return NULL;
1746
1747 #ifdef CONFIG_CGROUP_PERF
1748         if (event->cgrp && event->cgrp->css.cgroup)
1749                 curr_cgrp_id = event->cgrp->css.cgroup->kn->id;
1750
1751         if (next->cgrp && next->cgrp->css.cgroup)
1752                 next_cgrp_id = next->cgrp->css.cgroup->kn->id;
1753
1754         if (curr_cgrp_id != next_cgrp_id)
1755                 return NULL;
1756 #endif
1757         return next;
1758 }
1759
1760 /*
1761  * Iterate through the whole groups tree.
1762  */
1763 #define perf_event_groups_for_each(event, groups)                       \
1764         for (event = rb_entry_safe(rb_first(&((groups)->tree)),         \
1765                                 typeof(*event), group_node); event;     \
1766                 event = rb_entry_safe(rb_next(&event->group_node),      \
1767                                 typeof(*event), group_node))
1768
1769 /*
1770  * Add an event from the lists for its context.
1771  * Must be called with ctx->mutex and ctx->lock held.
1772  */
1773 static void
1774 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1775 {
1776         lockdep_assert_held(&ctx->lock);
1777
1778         WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1779         event->attach_state |= PERF_ATTACH_CONTEXT;
1780
1781         event->tstamp = perf_event_time(event);
1782
1783         /*
1784          * If we're a stand alone event or group leader, we go to the context
1785          * list, group events are kept attached to the group so that
1786          * perf_group_detach can, at all times, locate all siblings.
1787          */
1788         if (event->group_leader == event) {
1789                 event->group_caps = event->event_caps;
1790                 add_event_to_groups(event, ctx);
1791         }
1792
1793         list_update_cgroup_event(event, ctx, true);
1794
1795         list_add_rcu(&event->event_entry, &ctx->event_list);
1796         ctx->nr_events++;
1797         if (event->attr.inherit_stat)
1798                 ctx->nr_stat++;
1799
1800         ctx->generation++;
1801 }
1802
1803 /*
1804  * Initialize event state based on the perf_event_attr::disabled.
1805  */
1806 static inline void perf_event__state_init(struct perf_event *event)
1807 {
1808         event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1809                                               PERF_EVENT_STATE_INACTIVE;
1810 }
1811
1812 static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
1813 {
1814         int entry = sizeof(u64); /* value */
1815         int size = 0;
1816         int nr = 1;
1817
1818         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1819                 size += sizeof(u64);
1820
1821         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1822                 size += sizeof(u64);
1823
1824         if (event->attr.read_format & PERF_FORMAT_ID)
1825                 entry += sizeof(u64);
1826
1827         if (event->attr.read_format & PERF_FORMAT_GROUP) {
1828                 nr += nr_siblings;
1829                 size += sizeof(u64);
1830         }
1831
1832         size += entry * nr;
1833         event->read_size = size;
1834 }
1835
1836 static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
1837 {
1838         struct perf_sample_data *data;
1839         u16 size = 0;
1840
1841         if (sample_type & PERF_SAMPLE_IP)
1842                 size += sizeof(data->ip);
1843
1844         if (sample_type & PERF_SAMPLE_ADDR)
1845                 size += sizeof(data->addr);
1846
1847         if (sample_type & PERF_SAMPLE_PERIOD)
1848                 size += sizeof(data->period);
1849
1850         if (sample_type & PERF_SAMPLE_WEIGHT)
1851                 size += sizeof(data->weight);
1852
1853         if (sample_type & PERF_SAMPLE_READ)
1854                 size += event->read_size;
1855
1856         if (sample_type & PERF_SAMPLE_DATA_SRC)
1857                 size += sizeof(data->data_src.val);
1858
1859         if (sample_type & PERF_SAMPLE_TRANSACTION)
1860                 size += sizeof(data->txn);
1861
1862         if (sample_type & PERF_SAMPLE_PHYS_ADDR)
1863                 size += sizeof(data->phys_addr);
1864
1865         if (sample_type & PERF_SAMPLE_CGROUP)
1866                 size += sizeof(data->cgroup);
1867
1868         event->header_size = size;
1869 }
1870
1871 /*
1872  * Called at perf_event creation and when events are attached/detached from a
1873  * group.
1874  */
1875 static void perf_event__header_size(struct perf_event *event)
1876 {
1877         __perf_event_read_size(event,
1878                                event->group_leader->nr_siblings);
1879         __perf_event_header_size(event, event->attr.sample_type);
1880 }
1881
1882 static void perf_event__id_header_size(struct perf_event *event)
1883 {
1884         struct perf_sample_data *data;
1885         u64 sample_type = event->attr.sample_type;
1886         u16 size = 0;
1887
1888         if (sample_type & PERF_SAMPLE_TID)
1889                 size += sizeof(data->tid_entry);
1890
1891         if (sample_type & PERF_SAMPLE_TIME)
1892                 size += sizeof(data->time);
1893
1894         if (sample_type & PERF_SAMPLE_IDENTIFIER)
1895                 size += sizeof(data->id);
1896
1897         if (sample_type & PERF_SAMPLE_ID)
1898                 size += sizeof(data->id);
1899
1900         if (sample_type & PERF_SAMPLE_STREAM_ID)
1901                 size += sizeof(data->stream_id);
1902
1903         if (sample_type & PERF_SAMPLE_CPU)
1904                 size += sizeof(data->cpu_entry);
1905
1906         event->id_header_size = size;
1907 }
1908
1909 static bool perf_event_validate_size(struct perf_event *event)
1910 {
1911         /*
1912          * The values computed here will be over-written when we actually
1913          * attach the event.
1914          */
1915         __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
1916         __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
1917         perf_event__id_header_size(event);
1918
1919         /*
1920          * Sum the lot; should not exceed the 64k limit we have on records.
1921          * Conservative limit to allow for callchains and other variable fields.
1922          */
1923         if (event->read_size + event->header_size +
1924             event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
1925                 return false;
1926
1927         return true;
1928 }
1929
1930 static void perf_group_attach(struct perf_event *event)
1931 {
1932         struct perf_event *group_leader = event->group_leader, *pos;
1933
1934         lockdep_assert_held(&event->ctx->lock);
1935
1936         /*
1937          * We can have double attach due to group movement in perf_event_open.
1938          */
1939         if (event->attach_state & PERF_ATTACH_GROUP)
1940                 return;
1941
1942         event->attach_state |= PERF_ATTACH_GROUP;
1943
1944         if (group_leader == event)
1945                 return;
1946
1947         WARN_ON_ONCE(group_leader->ctx != event->ctx);
1948
1949         group_leader->group_caps &= event->event_caps;
1950
1951         list_add_tail(&event->sibling_list, &group_leader->sibling_list);
1952         group_leader->nr_siblings++;
1953
1954         perf_event__header_size(group_leader);
1955
1956         for_each_sibling_event(pos, group_leader)
1957                 perf_event__header_size(pos);
1958 }
1959
1960 /*
1961  * Remove an event from the lists for its context.
1962  * Must be called with ctx->mutex and ctx->lock held.
1963  */
1964 static void
1965 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1966 {
1967         WARN_ON_ONCE(event->ctx != ctx);
1968         lockdep_assert_held(&ctx->lock);
1969
1970         /*
1971          * We can have double detach due to exit/hot-unplug + close.
1972          */
1973         if (!(event->attach_state & PERF_ATTACH_CONTEXT))
1974                 return;
1975
1976         event->attach_state &= ~PERF_ATTACH_CONTEXT;
1977
1978         list_update_cgroup_event(event, ctx, false);
1979
1980         ctx->nr_events--;
1981         if (event->attr.inherit_stat)
1982                 ctx->nr_stat--;
1983
1984         list_del_rcu(&event->event_entry);
1985
1986         if (event->group_leader == event)
1987                 del_event_from_groups(event, ctx);
1988
1989         /*
1990          * If event was in error state, then keep it
1991          * that way, otherwise bogus counts will be
1992          * returned on read(). The only way to get out
1993          * of error state is by explicit re-enabling
1994          * of the event
1995          */
1996         if (event->state > PERF_EVENT_STATE_OFF)
1997                 perf_event_set_state(event, PERF_EVENT_STATE_OFF);
1998
1999         ctx->generation++;
2000 }
2001
2002 static int
2003 perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
2004 {
2005         if (!has_aux(aux_event))
2006                 return 0;
2007
2008         if (!event->pmu->aux_output_match)
2009                 return 0;
2010
2011         return event->pmu->aux_output_match(aux_event);
2012 }
2013
2014 static void put_event(struct perf_event *event);
2015 static void event_sched_out(struct perf_event *event,
2016                             struct perf_cpu_context *cpuctx,
2017                             struct perf_event_context *ctx);
2018
2019 static void perf_put_aux_event(struct perf_event *event)
2020 {
2021         struct perf_event_context *ctx = event->ctx;
2022         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2023         struct perf_event *iter;
2024
2025         /*
2026          * If event uses aux_event tear down the link
2027          */
2028         if (event->aux_event) {
2029                 iter = event->aux_event;
2030                 event->aux_event = NULL;
2031                 put_event(iter);
2032                 return;
2033         }
2034
2035         /*
2036          * If the event is an aux_event, tear down all links to
2037          * it from other events.
2038          */
2039         for_each_sibling_event(iter, event->group_leader) {
2040                 if (iter->aux_event != event)
2041                         continue;
2042
2043                 iter->aux_event = NULL;
2044                 put_event(event);
2045
2046                 /*
2047                  * If it's ACTIVE, schedule it out and put it into ERROR
2048                  * state so that we don't try to schedule it again. Note
2049                  * that perf_event_enable() will clear the ERROR status.
2050                  */
2051                 event_sched_out(iter, cpuctx, ctx);
2052                 perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
2053         }
2054 }
2055
2056 static bool perf_need_aux_event(struct perf_event *event)
2057 {
2058         return !!event->attr.aux_output || !!event->attr.aux_sample_size;
2059 }
2060
2061 static int perf_get_aux_event(struct perf_event *event,
2062                               struct perf_event *group_leader)
2063 {
2064         /*
2065          * Our group leader must be an aux event if we want to be
2066          * an aux_output. This way, the aux event will precede its
2067          * aux_output events in the group, and therefore will always
2068          * schedule first.
2069          */
2070         if (!group_leader)
2071                 return 0;
2072
2073         /*
2074          * aux_output and aux_sample_size are mutually exclusive.
2075          */
2076         if (event->attr.aux_output && event->attr.aux_sample_size)
2077                 return 0;
2078
2079         if (event->attr.aux_output &&
2080             !perf_aux_output_match(event, group_leader))
2081                 return 0;
2082
2083         if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
2084                 return 0;
2085
2086         if (!atomic_long_inc_not_zero(&group_leader->refcount))
2087                 return 0;
2088
2089         /*
2090          * Link aux_outputs to their aux event; this is undone in
2091          * perf_group_detach() by perf_put_aux_event(). When the
2092          * group in torn down, the aux_output events loose their
2093          * link to the aux_event and can't schedule any more.
2094          */
2095         event->aux_event = group_leader;
2096
2097         return 1;
2098 }
2099
2100 static inline struct list_head *get_event_list(struct perf_event *event)
2101 {
2102         struct perf_event_context *ctx = event->ctx;
2103         return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
2104 }
2105
2106 static void perf_group_detach(struct perf_event *event)
2107 {
2108         struct perf_event *sibling, *tmp;
2109         struct perf_event_context *ctx = event->ctx;
2110
2111         lockdep_assert_held(&ctx->lock);
2112
2113         /*
2114          * We can have double detach due to exit/hot-unplug + close.
2115          */
2116         if (!(event->attach_state & PERF_ATTACH_GROUP))
2117                 return;
2118
2119         event->attach_state &= ~PERF_ATTACH_GROUP;
2120
2121         perf_put_aux_event(event);
2122
2123         /*
2124          * If this is a sibling, remove it from its group.
2125          */
2126         if (event->group_leader != event) {
2127                 list_del_init(&event->sibling_list);
2128                 event->group_leader->nr_siblings--;
2129                 goto out;
2130         }
2131
2132         /*
2133          * If this was a group event with sibling events then
2134          * upgrade the siblings to singleton events by adding them
2135          * to whatever list we are on.
2136          */
2137         list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
2138
2139                 sibling->group_leader = sibling;
2140                 list_del_init(&sibling->sibling_list);
2141
2142                 /* Inherit group flags from the previous leader */
2143                 sibling->group_caps = event->group_caps;
2144
2145                 if (!RB_EMPTY_NODE(&event->group_node)) {
2146                         add_event_to_groups(sibling, event->ctx);
2147
2148                         if (sibling->state == PERF_EVENT_STATE_ACTIVE)
2149                                 list_add_tail(&sibling->active_list, get_event_list(sibling));
2150                 }
2151
2152                 WARN_ON_ONCE(sibling->ctx != event->ctx);
2153         }
2154
2155 out:
2156         perf_event__header_size(event->group_leader);
2157
2158         for_each_sibling_event(tmp, event->group_leader)
2159                 perf_event__header_size(tmp);
2160 }
2161
2162 static bool is_orphaned_event(struct perf_event *event)
2163 {
2164         return event->state == PERF_EVENT_STATE_DEAD;
2165 }
2166
2167 static inline int __pmu_filter_match(struct perf_event *event)
2168 {
2169         struct pmu *pmu = event->pmu;
2170         return pmu->filter_match ? pmu->filter_match(event) : 1;
2171 }
2172
2173 /*
2174  * Check whether we should attempt to schedule an event group based on
2175  * PMU-specific filtering. An event group can consist of HW and SW events,
2176  * potentially with a SW leader, so we must check all the filters, to
2177  * determine whether a group is schedulable:
2178  */
2179 static inline int pmu_filter_match(struct perf_event *event)
2180 {
2181         struct perf_event *sibling;
2182
2183         if (!__pmu_filter_match(event))
2184                 return 0;
2185
2186         for_each_sibling_event(sibling, event) {
2187                 if (!__pmu_filter_match(sibling))
2188                         return 0;
2189         }
2190
2191         return 1;
2192 }
2193
2194 static inline int
2195 event_filter_match(struct perf_event *event)
2196 {
2197         return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
2198                perf_cgroup_match(event) && pmu_filter_match(event);
2199 }
2200
2201 static void
2202 event_sched_out(struct perf_event *event,
2203                   struct perf_cpu_context *cpuctx,
2204                   struct perf_event_context *ctx)
2205 {
2206         enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
2207
2208         WARN_ON_ONCE(event->ctx != ctx);
2209         lockdep_assert_held(&ctx->lock);
2210
2211         if (event->state != PERF_EVENT_STATE_ACTIVE)
2212                 return;
2213
2214         /*
2215          * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but
2216          * we can schedule events _OUT_ individually through things like
2217          * __perf_remove_from_context().
2218          */
2219         list_del_init(&event->active_list);
2220
2221         perf_pmu_disable(event->pmu);
2222
2223         event->pmu->del(event, 0);
2224         event->oncpu = -1;
2225
2226         if (READ_ONCE(event->pending_disable) >= 0) {
2227                 WRITE_ONCE(event->pending_disable, -1);
2228                 state = PERF_EVENT_STATE_OFF;
2229         }
2230         perf_event_set_state(event, state);
2231
2232         if (!is_software_event(event))
2233                 cpuctx->active_oncpu--;
2234         if (!--ctx->nr_active)
2235                 perf_event_ctx_deactivate(ctx);
2236         if (event->attr.freq && event->attr.sample_freq)
2237                 ctx->nr_freq--;
2238         if (event->attr.exclusive || !cpuctx->active_oncpu)
2239                 cpuctx->exclusive = 0;
2240
2241         perf_pmu_enable(event->pmu);
2242 }
2243
2244 static void
2245 group_sched_out(struct perf_event *group_event,
2246                 struct perf_cpu_context *cpuctx,
2247                 struct perf_event_context *ctx)
2248 {
2249         struct perf_event *event;
2250
2251         if (group_event->state != PERF_EVENT_STATE_ACTIVE)
2252                 return;
2253
2254         perf_pmu_disable(ctx->pmu);
2255
2256         event_sched_out(group_event, cpuctx, ctx);
2257
2258         /*
2259          * Schedule out siblings (if any):
2260          */
2261         for_each_sibling_event(event, group_event)
2262                 event_sched_out(event, cpuctx, ctx);
2263
2264         perf_pmu_enable(ctx->pmu);
2265
2266         if (group_event->attr.exclusive)
2267                 cpuctx->exclusive = 0;
2268 }
2269
2270 #define DETACH_GROUP    0x01UL
2271
2272 /*
2273  * Cross CPU call to remove a performance event
2274  *
2275  * We disable the event on the hardware level first. After that we
2276  * remove it from the context list.
2277  */
2278 static void
2279 __perf_remove_from_context(struct perf_event *event,
2280                            struct perf_cpu_context *cpuctx,
2281                            struct perf_event_context *ctx,
2282                            void *info)
2283 {
2284         unsigned long flags = (unsigned long)info;
2285
2286         if (ctx->is_active & EVENT_TIME) {
2287                 update_context_time(ctx);
2288                 update_cgrp_time_from_cpuctx(cpuctx);
2289         }
2290
2291         event_sched_out(event, cpuctx, ctx);
2292         if (flags & DETACH_GROUP)
2293                 perf_group_detach(event);
2294         list_del_event(event, ctx);
2295
2296         if (!ctx->nr_events && ctx->is_active) {
2297                 ctx->is_active = 0;
2298                 ctx->rotate_necessary = 0;
2299                 if (ctx->task) {
2300                         WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2301                         cpuctx->task_ctx = NULL;
2302                 }
2303         }
2304 }
2305
2306 /*
2307  * Remove the event from a task's (or a CPU's) list of events.
2308  *
2309  * If event->ctx is a cloned context, callers must make sure that
2310  * every task struct that event->ctx->task could possibly point to
2311  * remains valid.  This is OK when called from perf_release since
2312  * that only calls us on the top-level context, which can't be a clone.
2313  * When called from perf_event_exit_task, it's OK because the
2314  * context has been detached from its task.
2315  */
2316 static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
2317 {
2318         struct perf_event_context *ctx = event->ctx;
2319
2320         lockdep_assert_held(&ctx->mutex);
2321
2322         event_function_call(event, __perf_remove_from_context, (void *)flags);
2323
2324         /*
2325          * The above event_function_call() can NO-OP when it hits
2326          * TASK_TOMBSTONE. In that case we must already have been detached
2327          * from the context (by perf_event_exit_event()) but the grouping
2328          * might still be in-tact.
2329          */
2330         WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
2331         if ((flags & DETACH_GROUP) &&
2332             (event->attach_state & PERF_ATTACH_GROUP)) {
2333                 /*
2334                  * Since in that case we cannot possibly be scheduled, simply
2335                  * detach now.
2336                  */
2337                 raw_spin_lock_irq(&ctx->lock);
2338                 perf_group_detach(event);
2339                 raw_spin_unlock_irq(&ctx->lock);
2340         }
2341 }
2342
2343 /*
2344  * Cross CPU call to disable a performance event
2345  */
2346 static void __perf_event_disable(struct perf_event *event,
2347                                  struct perf_cpu_context *cpuctx,
2348                                  struct perf_event_context *ctx,
2349                                  void *info)
2350 {
2351         if (event->state < PERF_EVENT_STATE_INACTIVE)
2352                 return;
2353
2354         if (ctx->is_active & EVENT_TIME) {
2355                 update_context_time(ctx);
2356                 update_cgrp_time_from_event(event);
2357         }
2358
2359         if (event == event->group_leader)
2360                 group_sched_out(event, cpuctx, ctx);
2361         else
2362                 event_sched_out(event, cpuctx, ctx);
2363
2364         perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2365 }
2366
2367 /*
2368  * Disable an event.
2369  *
2370  * If event->ctx is a cloned context, callers must make sure that
2371  * every task struct that event->ctx->task could possibly point to
2372  * remains valid.  This condition is satisfied when called through
2373  * perf_event_for_each_child or perf_event_for_each because they
2374  * hold the top-level event's child_mutex, so any descendant that
2375  * goes to exit will block in perf_event_exit_event().
2376  *
2377  * When called from perf_pending_event it's OK because event->ctx
2378  * is the current context on this CPU and preemption is disabled,
2379  * hence we can't get into perf_event_task_sched_out for this context.
2380  */
2381 static void _perf_event_disable(struct perf_event *event)
2382 {
2383         struct perf_event_context *ctx = event->ctx;
2384
2385         raw_spin_lock_irq(&ctx->lock);
2386         if (event->state <= PERF_EVENT_STATE_OFF) {
2387                 raw_spin_unlock_irq(&ctx->lock);
2388                 return;
2389         }
2390         raw_spin_unlock_irq(&ctx->lock);
2391
2392         event_function_call(event, __perf_event_disable, NULL);
2393 }
2394
2395 void perf_event_disable_local(struct perf_event *event)
2396 {
2397         event_function_local(event, __perf_event_disable, NULL);
2398 }
2399
2400 /*
2401  * Strictly speaking kernel users cannot create groups and therefore this
2402  * interface does not need the perf_event_ctx_lock() magic.
2403  */
2404 void perf_event_disable(struct perf_event *event)
2405 {
2406         struct perf_event_context *ctx;
2407
2408         ctx = perf_event_ctx_lock(event);
2409         _perf_event_disable(event);
2410         perf_event_ctx_unlock(event, ctx);
2411 }
2412 EXPORT_SYMBOL_GPL(perf_event_disable);
2413
2414 void perf_event_disable_inatomic(struct perf_event *event)
2415 {
2416         WRITE_ONCE(event->pending_disable, smp_processor_id());
2417         /* can fail, see perf_pending_event_disable() */
2418         irq_work_queue(&event->pending);
2419 }
2420
2421 static void perf_set_shadow_time(struct perf_event *event,
2422                                  struct perf_event_context *ctx)
2423 {
2424         /*
2425          * use the correct time source for the time snapshot
2426          *
2427          * We could get by without this by leveraging the
2428          * fact that to get to this function, the caller
2429          * has most likely already called update_context_time()
2430          * and update_cgrp_time_xx() and thus both timestamp
2431          * are identical (or very close). Given that tstamp is,
2432          * already adjusted for cgroup, we could say that:
2433          *    tstamp - ctx->timestamp
2434          * is equivalent to
2435          *    tstamp - cgrp->timestamp.
2436          *
2437          * Then, in perf_output_read(), the calculation would
2438          * work with no changes because:
2439          * - event is guaranteed scheduled in
2440          * - no scheduled out in between
2441          * - thus the timestamp would be the same
2442          *
2443          * But this is a bit hairy.
2444          *
2445          * So instead, we have an explicit cgroup call to remain
2446          * within the time time source all along. We believe it
2447          * is cleaner and simpler to understand.
2448          */
2449         if (is_cgroup_event(event))
2450                 perf_cgroup_set_shadow_time(event, event->tstamp);
2451         else
2452                 event->shadow_ctx_time = event->tstamp - ctx->timestamp;
2453 }
2454
2455 #define MAX_INTERRUPTS (~0ULL)
2456
2457 static void perf_log_throttle(struct perf_event *event, int enable);
2458 static void perf_log_itrace_start(struct perf_event *event);
2459
2460 static int
2461 event_sched_in(struct perf_event *event,
2462                  struct perf_cpu_context *cpuctx,
2463                  struct perf_event_context *ctx)
2464 {
2465         int ret = 0;
2466
2467         WARN_ON_ONCE(event->ctx != ctx);
2468
2469         lockdep_assert_held(&ctx->lock);
2470
2471         if (event->state <= PERF_EVENT_STATE_OFF)
2472                 return 0;
2473
2474         WRITE_ONCE(event->oncpu, smp_processor_id());
2475         /*
2476          * Order event::oncpu write to happen before the ACTIVE state is
2477          * visible. This allows perf_event_{stop,read}() to observe the correct
2478          * ->oncpu if it sees ACTIVE.
2479          */
2480         smp_wmb();
2481         perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);
2482
2483         /*
2484          * Unthrottle events, since we scheduled we might have missed several
2485          * ticks already, also for a heavily scheduling task there is little
2486          * guarantee it'll get a tick in a timely manner.
2487          */
2488         if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
2489                 perf_log_throttle(event, 1);
2490                 event->hw.interrupts = 0;
2491         }
2492
2493         perf_pmu_disable(event->pmu);
2494
2495         perf_set_shadow_time(event, ctx);
2496
2497         perf_log_itrace_start(event);
2498
2499         if (event->pmu->add(event, PERF_EF_START)) {
2500                 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2501                 event->oncpu = -1;
2502                 ret = -EAGAIN;
2503                 goto out;
2504         }
2505
2506         if (!is_software_event(event))
2507                 cpuctx->active_oncpu++;
2508         if (!ctx->nr_active++)
2509                 perf_event_ctx_activate(ctx);
2510         if (event->attr.freq && event->attr.sample_freq)
2511                 ctx->nr_freq++;
2512
2513         if (event->attr.exclusive)
2514                 cpuctx->exclusive = 1;
2515
2516 out:
2517         perf_pmu_enable(event->pmu);
2518
2519         return ret;
2520 }
2521
2522 static int
2523 group_sched_in(struct perf_event *group_event,
2524                struct perf_cpu_context *cpuctx,
2525                struct perf_event_context *ctx)
2526 {
2527         struct perf_event *event, *partial_group = NULL;
2528         struct pmu *pmu = ctx->pmu;
2529
2530         if (group_event->state == PERF_EVENT_STATE_OFF)
2531                 return 0;
2532
2533         pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
2534
2535         if (event_sched_in(group_event, cpuctx, ctx)) {
2536                 pmu->cancel_txn(pmu);
2537                 perf_mux_hrtimer_restart(cpuctx);
2538                 return -EAGAIN;
2539         }
2540
2541         /*
2542          * Schedule in siblings as one group (if any):
2543          */
2544         for_each_sibling_event(event, group_event) {
2545                 if (event_sched_in(event, cpuctx, ctx)) {
2546                         partial_group = event;
2547                         goto group_error;
2548                 }
2549         }
2550
2551         if (!pmu->commit_txn(pmu))
2552                 return 0;
2553
2554 group_error:
2555         /*
2556          * Groups can be scheduled in as one unit only, so undo any
2557          * partial group before returning:
2558          * The events up to the failed event are scheduled out normally.
2559          */
2560         for_each_sibling_event(event, group_event) {
2561                 if (event == partial_group)
2562                         break;
2563
2564                 event_sched_out(event, cpuctx, ctx);
2565         }
2566         event_sched_out(group_event, cpuctx, ctx);
2567
2568         pmu->cancel_txn(pmu);
2569
2570         perf_mux_hrtimer_restart(cpuctx);
2571
2572         return -EAGAIN;
2573 }
2574
2575 /*
2576  * Work out whether we can put this event group on the CPU now.
2577  */
2578 static int group_can_go_on(struct perf_event *event,
2579                            struct perf_cpu_context *cpuctx,
2580                            int can_add_hw)
2581 {
2582         /*
2583          * Groups consisting entirely of software events can always go on.
2584          */
2585         if (event->group_caps & PERF_EV_CAP_SOFTWARE)
2586                 return 1;
2587         /*
2588          * If an exclusive group is already on, no other hardware
2589          * events can go on.
2590          */
2591         if (cpuctx->exclusive)
2592                 return 0;
2593         /*
2594          * If this group is exclusive and there are already
2595          * events on the CPU, it can't go on.
2596          */
2597         if (event->attr.exclusive && cpuctx->active_oncpu)
2598                 return 0;
2599         /*
2600          * Otherwise, try to add it if all previous groups were able
2601          * to go on.
2602          */
2603         return can_add_hw;
2604 }
2605
2606 static void add_event_to_ctx(struct perf_event *event,
2607                                struct perf_event_context *ctx)
2608 {
2609         list_add_event(event, ctx);
2610         perf_group_attach(event);
2611 }
2612
2613 static void ctx_sched_out(struct perf_event_context *ctx,
2614                           struct perf_cpu_context *cpuctx,
2615                           enum event_type_t event_type);
2616 static void
2617 ctx_sched_in(struct perf_event_context *ctx,
2618              struct perf_cpu_context *cpuctx,
2619              enum event_type_t event_type,
2620              struct task_struct *task);
2621
2622 static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
2623                                struct perf_event_context *ctx,
2624                                enum event_type_t event_type)
2625 {
2626         if (!cpuctx->task_ctx)
2627                 return;
2628
2629         if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2630                 return;
2631
2632         ctx_sched_out(ctx, cpuctx, event_type);
2633 }
2634
2635 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2636                                 struct perf_event_context *ctx,
2637                                 struct task_struct *task)
2638 {
2639         cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
2640         if (ctx)
2641                 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2642         cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2643         if (ctx)
2644                 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2645 }
2646
2647 /*
2648  * We want to maintain the following priority of scheduling:
2649  *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
2650  *  - task pinned (EVENT_PINNED)
2651  *  - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
2652  *  - task flexible (EVENT_FLEXIBLE).
2653  *
2654  * In order to avoid unscheduling and scheduling back in everything every
2655  * time an event is added, only do it for the groups of equal priority and
2656  * below.
2657  *
2658  * This can be called after a batch operation on task events, in which case
2659  * event_type is a bit mask of the types of events involved. For CPU events,
2660  * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
2661  */
2662 static void ctx_resched(struct perf_cpu_context *cpuctx,
2663                         struct perf_event_context *task_ctx,
2664                         enum event_type_t event_type)
2665 {
2666         enum event_type_t ctx_event_type;
2667         bool cpu_event = !!(event_type & EVENT_CPU);
2668
2669         /*
2670          * If pinned groups are involved, flexible groups also need to be
2671          * scheduled out.
2672          */
2673         if (event_type & EVENT_PINNED)
2674                 event_type |= EVENT_FLEXIBLE;
2675
2676         ctx_event_type = event_type & EVENT_ALL;
2677
2678         perf_pmu_disable(cpuctx->ctx.pmu);
2679         if (task_ctx)
2680                 task_ctx_sched_out(cpuctx, task_ctx, event_type);
2681
2682         /*
2683          * Decide which cpu ctx groups to schedule out based on the types
2684          * of events that caused rescheduling:
2685          *  - EVENT_CPU: schedule out corresponding groups;
2686          *  - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
2687          *  - otherwise, do nothing more.
2688          */
2689         if (cpu_event)
2690                 cpu_ctx_sched_out(cpuctx, ctx_event_type);
2691         else if (ctx_event_type & EVENT_PINNED)
2692                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2693
2694         perf_event_sched_in(cpuctx, task_ctx, current);
2695         perf_pmu_enable(cpuctx->ctx.pmu);
2696 }
2697
2698 void perf_pmu_resched(struct pmu *pmu)
2699 {
2700         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2701         struct perf_event_context *task_ctx = cpuctx->task_ctx;
2702
2703         perf_ctx_lock(cpuctx, task_ctx);
2704         ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
2705         perf_ctx_unlock(cpuctx, task_ctx);
2706 }
2707
2708 /*
2709  * Cross CPU call to install and enable a performance event
2710  *
2711  * Very similar to remote_function() + event_function() but cannot assume that
2712  * things like ctx->is_active and cpuctx->task_ctx are set.
2713  */
2714 static int  __perf_install_in_context(void *info)
2715 {
2716         struct perf_event *event = info;
2717         struct perf_event_context *ctx = event->ctx;
2718         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2719         struct perf_event_context *task_ctx = cpuctx->task_ctx;
2720         bool reprogram = true;
2721         int ret = 0;
2722
2723         raw_spin_lock(&cpuctx->ctx.lock);
2724         if (ctx->task) {
2725                 raw_spin_lock(&ctx->lock);
2726                 task_ctx = ctx;
2727
2728                 reprogram = (ctx->task == current);
2729
2730                 /*
2731                  * If the task is running, it must be running on this CPU,
2732                  * otherwise we cannot reprogram things.
2733                  *
2734                  * If its not running, we don't care, ctx->lock will
2735                  * serialize against it becoming runnable.
2736                  */
2737                 if (task_curr(ctx->task) && !reprogram) {
2738                         ret = -ESRCH;
2739                         goto unlock;
2740                 }
2741
2742                 WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
2743         } else if (task_ctx) {
2744                 raw_spin_lock(&task_ctx->lock);
2745         }
2746
2747 #ifdef CONFIG_CGROUP_PERF
2748         if (is_cgroup_event(event)) {
2749                 /*
2750                  * If the current cgroup doesn't match the event's
2751                  * cgroup, we should not try to schedule it.
2752                  */
2753                 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
2754                 reprogram = cgroup_is_descendant(cgrp->css.cgroup,
2755                                         event->cgrp->css.cgroup);
2756         }
2757 #endif
2758
2759         if (reprogram) {
2760                 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2761                 add_event_to_ctx(event, ctx);
2762                 ctx_resched(cpuctx, task_ctx, get_event_type(event));
2763         } else {
2764                 add_event_to_ctx(event, ctx);
2765         }
2766
2767 unlock:
2768         perf_ctx_unlock(cpuctx, task_ctx);
2769
2770         return ret;
2771 }
2772
2773 static bool exclusive_event_installable(struct perf_event *event,
2774                                         struct perf_event_context *ctx);
2775
2776 /*
2777  * Attach a performance event to a context.
2778  *
2779  * Very similar to event_function_call, see comment there.
2780  */
2781 static void
2782 perf_install_in_context(struct perf_event_context *ctx,
2783                         struct perf_event *event,
2784                         int cpu)
2785 {
2786         struct task_struct *task = READ_ONCE(ctx->task);
2787
2788         lockdep_assert_held(&ctx->mutex);
2789
2790         WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
2791
2792         if (event->cpu != -1)
2793                 event->cpu = cpu;
2794
2795         /*
2796          * Ensures that if we can observe event->ctx, both the event and ctx
2797          * will be 'complete'. See perf_iterate_sb_cpu().
2798          */
2799         smp_store_release(&event->ctx, ctx);
2800
2801         /*
2802          * perf_event_attr::disabled events will not run and can be initialized
2803          * without IPI. Except when this is the first event for the context, in
2804          * that case we need the magic of the IPI to set ctx->is_active.
2805          *
2806          * The IOC_ENABLE that is sure to follow the creation of a disabled
2807          * event will issue the IPI and reprogram the hardware.
2808          */
2809         if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) {
2810                 raw_spin_lock_irq(&ctx->lock);
2811                 if (ctx->task == TASK_TOMBSTONE) {
2812                         raw_spin_unlock_irq(&ctx->lock);
2813                         return;
2814                 }
2815                 add_event_to_ctx(event, ctx);
2816                 raw_spin_unlock_irq(&ctx->lock);
2817                 return;
2818         }
2819
2820         if (!task) {
2821                 cpu_function_call(cpu, __perf_install_in_context, event);
2822                 return;
2823         }
2824
2825         /*
2826          * Should not happen, we validate the ctx is still alive before calling.
2827          */
2828         if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
2829                 return;
2830
2831         /*
2832          * Installing events is tricky because we cannot rely on ctx->is_active
2833          * to be set in case this is the nr_events 0 -> 1 transition.
2834          *
2835          * Instead we use task_curr(), which tells us if the task is running.
2836          * However, since we use task_curr() outside of rq::lock, we can race
2837          * against the actual state. This means the result can be wrong.
2838          *
2839          * If we get a false positive, we retry, this is harmless.
2840          *
2841          * If we get a false negative, things are complicated. If we are after
2842          * perf_event_context_sched_in() ctx::lock will serialize us, and the
2843          * value must be correct. If we're before, it doesn't matter since
2844          * perf_event_context_sched_in() will program the counter.
2845          *
2846          * However, this hinges on the remote context switch having observed
2847          * our task->perf_event_ctxp[] store, such that it will in fact take
2848          * ctx::lock in perf_event_context_sched_in().
2849          *
2850          * We do this by task_function_call(), if the IPI fails to hit the task
2851          * we know any future context switch of task must see the
2852          * perf_event_ctpx[] store.
2853          */
2854
2855         /*
2856          * This smp_mb() orders the task->perf_event_ctxp[] store with the
2857          * task_cpu() load, such that if the IPI then does not find the task
2858          * running, a future context switch of that task must observe the
2859          * store.
2860          */
2861         smp_mb();
2862 again:
2863         if (!task_function_call(task, __perf_install_in_context, event))
2864                 return;
2865
2866         raw_spin_lock_irq(&ctx->lock);
2867         task = ctx->task;
2868         if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
2869                 /*
2870                  * Cannot happen because we already checked above (which also
2871                  * cannot happen), and we hold ctx->mutex, which serializes us
2872                  * against perf_event_exit_task_context().
2873                  */
2874                 raw_spin_unlock_irq(&ctx->lock);
2875                 return;
2876         }
2877         /*
2878          * If the task is not running, ctx->lock will avoid it becoming so,
2879          * thus we can safely install the event.
2880          */
2881         if (task_curr(task)) {
2882                 raw_spin_unlock_irq(&ctx->lock);
2883                 goto again;
2884         }
2885         add_event_to_ctx(event, ctx);
2886         raw_spin_unlock_irq(&ctx->lock);
2887 }
2888
2889 /*
2890  * Cross CPU call to enable a performance event
2891  */
2892 static void __perf_event_enable(struct perf_event *event,
2893                                 struct perf_cpu_context *cpuctx,
2894                                 struct perf_event_context *ctx,
2895                                 void *info)
2896 {
2897         struct perf_event *leader = event->group_leader;
2898         struct perf_event_context *task_ctx;
2899
2900         if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2901             event->state <= PERF_EVENT_STATE_ERROR)
2902                 return;
2903
2904         if (ctx->is_active)
2905                 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2906
2907         perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2908
2909         if (!ctx->is_active)
2910                 return;
2911
2912         if (!event_filter_match(event)) {
2913                 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2914                 return;
2915         }
2916
2917         /*
2918          * If the event is in a group and isn't the group leader,
2919          * then don't put it on unless the group is on.
2920          */
2921         if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
2922                 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2923                 return;
2924         }
2925
2926         task_ctx = cpuctx->task_ctx;
2927         if (ctx->task)
2928                 WARN_ON_ONCE(task_ctx != ctx);
2929
2930         ctx_resched(cpuctx, task_ctx, get_event_type(event));
2931 }
2932
2933 /*
2934  * Enable an event.
2935  *
2936  * If event->ctx is a cloned context, callers must make sure that
2937  * every task struct that event->ctx->task could possibly point to
2938  * remains valid.  This condition is satisfied when called through
2939  * perf_event_for_each_child or perf_event_for_each as described
2940  * for perf_event_disable.
2941  */
2942 static void _perf_event_enable(struct perf_event *event)
2943 {
2944         struct perf_event_context *ctx = event->ctx;
2945
2946         raw_spin_lock_irq(&ctx->lock);
2947         if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2948             event->state <  PERF_EVENT_STATE_ERROR) {
2949                 raw_spin_unlock_irq(&ctx->lock);
2950                 return;
2951         }
2952
2953         /*
2954          * If the event is in error state, clear that first.
2955          *
2956          * That way, if we see the event in error state below, we know that it
2957          * has gone back into error state, as distinct from the task having
2958          * been scheduled away before the cross-call arrived.
2959          */
2960         if (event->state == PERF_EVENT_STATE_ERROR)
2961                 event->state = PERF_EVENT_STATE_OFF;
2962         raw_spin_unlock_irq(&ctx->lock);
2963
2964         event_function_call(event, __perf_event_enable, NULL);
2965 }
2966
2967 /*
2968  * See perf_event_disable();
2969  */
2970 void perf_event_enable(struct perf_event *event)
2971 {
2972         struct perf_event_context *ctx;
2973
2974         ctx = perf_event_ctx_lock(event);
2975         _perf_event_enable(event);
2976         perf_event_ctx_unlock(event, ctx);
2977 }
2978 EXPORT_SYMBOL_GPL(perf_event_enable);
2979
2980 struct stop_event_data {
2981         struct perf_event       *event;
2982         unsigned int            restart;
2983 };
2984
2985 static int __perf_event_stop(void *info)
2986 {
2987         struct stop_event_data *sd = info;
2988         struct perf_event *event = sd->event;
2989
2990         /* if it's already INACTIVE, do nothing */
2991         if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
2992                 return 0;
2993
2994         /* matches smp_wmb() in event_sched_in() */
2995         smp_rmb();
2996
2997         /*
2998          * There is a window with interrupts enabled before we get here,
2999          * so we need to check again lest we try to stop another CPU's event.
3000          */
3001         if (READ_ONCE(event->oncpu) != smp_processor_id())
3002                 return -EAGAIN;
3003
3004         event->pmu->stop(event, PERF_EF_UPDATE);
3005
3006         /*
3007          * May race with the actual stop (through perf_pmu_output_stop()),
3008          * but it is only used for events with AUX ring buffer, and such
3009          * events will refuse to restart because of rb::aux_mmap_count==0,
3010          * see comments in perf_aux_output_begin().
3011          *
3012          * Since this is happening on an event-local CPU, no trace is lost
3013          * while restarting.
3014          */
3015         if (sd->restart)
3016                 event->pmu->start(event, 0);
3017
3018         return 0;
3019 }
3020
3021 static int perf_event_stop(struct perf_event *event, int restart)
3022 {
3023         struct stop_event_data sd = {
3024                 .event          = event,
3025                 .restart        = restart,
3026         };
3027         int ret = 0;
3028
3029         do {
3030                 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
3031                         return 0;
3032
3033                 /* matches smp_wmb() in event_sched_in() */
3034                 smp_rmb();
3035
3036                 /*
3037                  * We only want to restart ACTIVE events, so if the event goes
3038                  * inactive here (event->oncpu==-1), there's nothing more to do;
3039                  * fall through with ret==-ENXIO.
3040                  */
3041                 ret = cpu_function_call(READ_ONCE(event->oncpu),
3042                                         __perf_event_stop, &sd);
3043         } while (ret == -EAGAIN);
3044
3045         return ret;
3046 }
3047
3048 /*
3049  * In order to contain the amount of racy and tricky in the address filter
3050  * configuration management, it is a two part process:
3051  *
3052  * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
3053  *      we update the addresses of corresponding vmas in
3054  *      event::addr_filter_ranges array and bump the event::addr_filters_gen;
3055  * (p2) when an event is scheduled in (pmu::add), it calls
3056  *      perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
3057  *      if the generation has changed since the previous call.
3058  *
3059  * If (p1) happens while the event is active, we restart it to force (p2).
3060  *
3061  * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
3062  *     pre-existing mappings, called once when new filters arrive via SET_FILTER
3063  *     ioctl;
3064  * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
3065  *     registered mapping, called for every new mmap(), with mm::mmap_sem down
3066  *     for reading;
3067  * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
3068  *     of exec.
3069  */
3070 void perf_event_addr_filters_sync(struct perf_event *event)
3071 {
3072         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
3073
3074         if (!has_addr_filter(event))
3075                 return;
3076
3077         raw_spin_lock(&ifh->lock);
3078         if (event->addr_filters_gen != event->hw.addr_filters_gen) {
3079                 event->pmu->addr_filters_sync(event);
3080                 event->hw.addr_filters_gen = event->addr_filters_gen;
3081         }
3082         raw_spin_unlock(&ifh->lock);
3083 }
3084 EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
3085
3086 static int _perf_event_refresh(struct perf_event *event, int refresh)
3087 {
3088         /*
3089          * not supported on inherited events
3090          */
3091         if (event->attr.inherit || !is_sampling_event(event))
3092                 return -EINVAL;
3093
3094         atomic_add(refresh, &event->event_limit);
3095         _perf_event_enable(event);
3096
3097         return 0;
3098 }
3099
3100 /*
3101  * See perf_event_disable()
3102  */
3103 int perf_event_refresh(struct perf_event *event, int refresh)
3104 {
3105         struct perf_event_context *ctx;
3106         int ret;
3107
3108         ctx = perf_event_ctx_lock(event);
3109         ret = _perf_event_refresh(event, refresh);
3110         perf_event_ctx_unlock(event, ctx);
3111
3112         return ret;
3113 }
3114 EXPORT_SYMBOL_GPL(perf_event_refresh);
3115
3116 static int perf_event_modify_breakpoint(struct perf_event *bp,
3117                                          struct perf_event_attr *attr)
3118 {
3119         int err;
3120
3121         _perf_event_disable(bp);
3122
3123         err = modify_user_hw_breakpoint_check(bp, attr, true);
3124
3125         if (!bp->attr.disabled)
3126                 _perf_event_enable(bp);
3127
3128         return err;
3129 }
3130
3131 static int perf_event_modify_attr(struct perf_event *event,
3132                                   struct perf_event_attr *attr)
3133 {
3134         if (event->attr.type != attr->type)
3135                 return -EINVAL;
3136
3137         switch (event->attr.type) {
3138         case PERF_TYPE_BREAKPOINT:
3139                 return perf_event_modify_breakpoint(event, attr);
3140         default:
3141                 /* Place holder for future additions. */
3142                 return -EOPNOTSUPP;
3143         }
3144 }
3145
3146 static void ctx_sched_out(struct perf_event_context *ctx,
3147                           struct perf_cpu_context *cpuctx,
3148                           enum event_type_t event_type)
3149 {
3150         struct perf_event *event, *tmp;
3151         int is_active = ctx->is_active;
3152
3153         lockdep_assert_held(&ctx->lock);
3154
3155         if (likely(!ctx->nr_events)) {
3156                 /*
3157                  * See __perf_remove_from_context().
3158                  */
3159                 WARN_ON_ONCE(ctx->is_active);
3160                 if (ctx->task)
3161                         WARN_ON_ONCE(cpuctx->task_ctx);
3162                 return;
3163         }
3164
3165         ctx->is_active &= ~event_type;
3166         if (!(ctx->is_active & EVENT_ALL))
3167                 ctx->is_active = 0;
3168
3169         if (ctx->task) {
3170                 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3171                 if (!ctx->is_active)
3172                         cpuctx->task_ctx = NULL;
3173         }
3174
3175         /*
3176          * Always update time if it was set; not only when it changes.
3177          * Otherwise we can 'forget' to update time for any but the last
3178          * context we sched out. For example:
3179          *
3180          *   ctx_sched_out(.event_type = EVENT_FLEXIBLE)
3181          *   ctx_sched_out(.event_type = EVENT_PINNED)
3182          *
3183          * would only update time for the pinned events.
3184          */
3185         if (is_active & EVENT_TIME) {
3186                 /* update (and stop) ctx time */
3187                 update_context_time(ctx);
3188                 update_cgrp_time_from_cpuctx(cpuctx);
3189         }
3190
3191         is_active ^= ctx->is_active; /* changed bits */
3192
3193         if (!ctx->nr_active || !(is_active & EVENT_ALL))
3194                 return;
3195
3196         perf_pmu_disable(ctx->pmu);
3197         if (is_active & EVENT_PINNED) {
3198                 list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
3199                         group_sched_out(event, cpuctx, ctx);
3200         }
3201
3202         if (is_active & EVENT_FLEXIBLE) {
3203                 list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
3204                         group_sched_out(event, cpuctx, ctx);
3205
3206                 /*
3207                  * Since we cleared EVENT_FLEXIBLE, also clear
3208                  * rotate_necessary, is will be reset by
3209                  * ctx_flexible_sched_in() when needed.
3210                  */
3211                 ctx->rotate_necessary = 0;
3212         }
3213         perf_pmu_enable(ctx->pmu);
3214 }
3215
3216 /*
3217  * Test whether two contexts are equivalent, i.e. whether they have both been
3218  * cloned from the same version of the same context.
3219  *
3220  * Equivalence is measured using a generation number in the context that is
3221  * incremented on each modification to it; see unclone_ctx(), list_add_event()
3222  * and list_del_event().
3223  */
3224 static int context_equiv(struct perf_event_context *ctx1,
3225                          struct perf_event_context *ctx2)
3226 {
3227         lockdep_assert_held(&ctx1->lock);
3228         lockdep_assert_held(&ctx2->lock);
3229
3230         /* Pinning disables the swap optimization */
3231         if (ctx1->pin_count || ctx2->pin_count)
3232                 return 0;
3233
3234         /* If ctx1 is the parent of ctx2 */
3235         if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
3236                 return 1;
3237
3238         /* If ctx2 is the parent of ctx1 */
3239         if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
3240                 return 1;
3241
3242         /*
3243          * If ctx1 and ctx2 have the same parent; we flatten the parent
3244          * hierarchy, see perf_event_init_context().
3245          */
3246         if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
3247                         ctx1->parent_gen == ctx2->parent_gen)
3248                 return 1;
3249
3250         /* Unmatched */
3251         return 0;
3252 }
3253
3254 static void __perf_event_sync_stat(struct perf_event *event,
3255                                      struct perf_event *next_event)
3256 {
3257         u64 value;
3258
3259         if (!event->attr.inherit_stat)
3260                 return;
3261
3262         /*
3263          * Update the event value, we cannot use perf_event_read()
3264          * because we're in the middle of a context switch and have IRQs
3265          * disabled, which upsets smp_call_function_single(), however
3266          * we know the event must be on the current CPU, therefore we
3267          * don't need to use it.
3268          */
3269         if (event->state == PERF_EVENT_STATE_ACTIVE)
3270                 event->pmu->read(event);
3271
3272         perf_event_update_time(event);
3273
3274         /*
3275          * In order to keep per-task stats reliable we need to flip the event
3276          * values when we flip the contexts.
3277          */
3278         value = local64_read(&next_event->count);
3279         value = local64_xchg(&event->count, value);
3280         local64_set(&next_event->count, value);
3281
3282         swap(event->total_time_enabled, next_event->total_time_enabled);
3283         swap(event->total_time_running, next_event->total_time_running);
3284
3285         /*
3286          * Since we swizzled the values, update the user visible data too.
3287          */
3288         perf_event_update_userpage(event);
3289         perf_event_update_userpage(next_event);
3290 }
3291
3292 static void perf_event_sync_stat(struct perf_event_context *ctx,
3293                                    struct perf_event_context *next_ctx)
3294 {
3295         struct perf_event *event, *next_event;
3296
3297         if (!ctx->nr_stat)
3298                 return;
3299
3300         update_context_time(ctx);
3301
3302         event = list_first_entry(&ctx->event_list,
3303                                    struct perf_event, event_entry);
3304
3305         next_event = list_first_entry(&next_ctx->event_list,
3306                                         struct perf_event, event_entry);
3307
3308         while (&event->event_entry != &ctx->event_list &&
3309                &next_event->event_entry != &next_ctx->event_list) {
3310
3311                 __perf_event_sync_stat(event, next_event);
3312
3313                 event = list_next_entry(event, event_entry);
3314                 next_event = list_next_entry(next_event, event_entry);
3315         }
3316 }
3317
3318 static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
3319                                          struct task_struct *next)
3320 {
3321         struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
3322         struct perf_event_context *next_ctx;
3323         struct perf_event_context *parent, *next_parent;
3324         struct perf_cpu_context *cpuctx;
3325         int do_switch = 1;
3326
3327         if (likely(!ctx))
3328                 return;
3329
3330         cpuctx = __get_cpu_context(ctx);
3331         if (!cpuctx->task_ctx)
3332                 return;
3333
3334         rcu_read_lock();
3335         next_ctx = next->perf_event_ctxp[ctxn];
3336         if (!next_ctx)
3337                 goto unlock;
3338
3339         parent = rcu_dereference(ctx->parent_ctx);
3340         next_parent = rcu_dereference(next_ctx->parent_ctx);
3341
3342         /* If neither context have a parent context; they cannot be clones. */
3343         if (!parent && !next_parent)
3344                 goto unlock;
3345
3346         if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
3347                 /*
3348                  * Looks like the two contexts are clones, so we might be
3349                  * able to optimize the context switch.  We lock both
3350                  * contexts and check that they are clones under the
3351                  * lock (including re-checking that neither has been
3352                  * uncloned in the meantime).  It doesn't matter which
3353                  * order we take the locks because no other cpu could
3354                  * be trying to lock both of these tasks.
3355                  */
3356                 raw_spin_lock(&ctx->lock);
3357                 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
3358                 if (context_equiv(ctx, next_ctx)) {
3359                         struct pmu *pmu = ctx->pmu;
3360
3361                         WRITE_ONCE(ctx->task, next);
3362                         WRITE_ONCE(next_ctx->task, task);
3363
3364                         /*
3365                          * PMU specific parts of task perf context can require
3366                          * additional synchronization. As an example of such
3367                          * synchronization see implementation details of Intel
3368                          * LBR call stack data profiling;
3369                          */
3370                         if (pmu->swap_task_ctx)
3371                                 pmu->swap_task_ctx(ctx, next_ctx);
3372                         else
3373                                 swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
3374
3375                         /*
3376                          * RCU_INIT_POINTER here is safe because we've not
3377                          * modified the ctx and the above modification of
3378                          * ctx->task and ctx->task_ctx_data are immaterial
3379                          * since those values are always verified under
3380                          * ctx->lock which we're now holding.
3381                          */
3382                         RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
3383                         RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
3384
3385                         do_switch = 0;
3386
3387                         perf_event_sync_stat(ctx, next_ctx);
3388                 }
3389                 raw_spin_unlock(&next_ctx->lock);
3390                 raw_spin_unlock(&ctx->lock);
3391         }
3392 unlock:
3393         rcu_read_unlock();
3394
3395         if (do_switch) {
3396                 raw_spin_lock(&ctx->lock);
3397                 task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
3398                 raw_spin_unlock(&ctx->lock);
3399         }
3400 }
3401
3402 static DEFINE_PER_CPU(struct list_head, sched_cb_list);
3403
3404 void perf_sched_cb_dec(struct pmu *pmu)
3405 {
3406         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3407
3408         this_cpu_dec(perf_sched_cb_usages);
3409
3410         if (!--cpuctx->sched_cb_usage)
3411                 list_del(&cpuctx->sched_cb_entry);
3412 }
3413
3414
3415 void perf_sched_cb_inc(struct pmu *pmu)
3416 {
3417         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3418
3419         if (!cpuctx->sched_cb_usage++)
3420                 list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
3421
3422         this_cpu_inc(perf_sched_cb_usages);
3423 }
3424
3425 /*
3426  * This function provides the context switch callback to the lower code
3427  * layer. It is invoked ONLY when the context switch callback is enabled.
3428  *
3429  * This callback is relevant even to per-cpu events; for example multi event
3430  * PEBS requires this to provide PID/TID information. This requires we flush
3431  * all queued PEBS records before we context switch to a new task.
3432  */
3433 static void perf_pmu_sched_task(struct task_struct *prev,
3434                                 struct task_struct *next,
3435                                 bool sched_in)
3436 {
3437         struct perf_cpu_context *cpuctx;
3438         struct pmu *pmu;
3439
3440         if (prev == next)
3441                 return;
3442
3443         list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
3444                 pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
3445
3446                 if (WARN_ON_ONCE(!pmu->sched_task))
3447                         continue;
3448
3449                 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3450                 perf_pmu_disable(pmu);
3451
3452                 pmu->sched_task(cpuctx->task_ctx, sched_in);
3453
3454                 perf_pmu_enable(pmu);
3455                 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3456         }
3457 }
3458
3459 static void perf_event_switch(struct task_struct *task,
3460                               struct task_struct *next_prev, bool sched_in);
3461
3462 #define for_each_task_context_nr(ctxn)                                  \
3463         for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
3464
3465 /*
3466  * Called from scheduler to remove the events of the current task,
3467  * with interrupts disabled.
3468  *
3469  * We stop each event and update the event value in event->count.
3470  *
3471  * This does not protect us against NMI, but disable()
3472  * sets the disabled bit in the control field of event _before_
3473  * accessing the event control register. If a NMI hits, then it will
3474  * not restart the event.
3475  */
3476 void __perf_event_task_sched_out(struct task_struct *task,
3477                                  struct task_struct *next)
3478 {
3479         int ctxn;
3480
3481         if (__this_cpu_read(perf_sched_cb_usages))
3482                 perf_pmu_sched_task(task, next, false);
3483
3484         if (atomic_read(&nr_switch_events))
3485                 perf_event_switch(task, next, false);
3486
3487         for_each_task_context_nr(ctxn)
3488                 perf_event_context_sched_out(task, ctxn, next);
3489
3490         /*
3491          * if cgroup events exist on this CPU, then we need
3492          * to check if we have to switch out PMU state.
3493          * cgroup event are system-wide mode only
3494          */
3495         if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3496                 perf_cgroup_sched_out(task, next);
3497 }
3498
3499 /*
3500  * Called with IRQs disabled
3501  */
3502 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
3503                               enum event_type_t event_type)
3504 {
3505         ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
3506 }
3507
3508 static bool perf_less_group_idx(const void *l, const void *r)
3509 {
3510         const struct perf_event *le = l, *re = r;
3511
3512         return le->group_index < re->group_index;
3513 }
3514
3515 static void swap_ptr(void *l, void *r)
3516 {
3517         void **lp = l, **rp = r;
3518
3519         swap(*lp, *rp);
3520 }
3521
3522 static const struct min_heap_callbacks perf_min_heap = {
3523         .elem_size = sizeof(struct perf_event *),
3524         .less = perf_less_group_idx,
3525         .swp = swap_ptr,
3526 };
3527
3528 static void __heap_add(struct min_heap *heap, struct perf_event *event)
3529 {
3530         struct perf_event **itrs = heap->data;
3531
3532         if (event) {
3533                 itrs[heap->nr] = event;
3534                 heap->nr++;
3535         }
3536 }
3537
3538 static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
3539                                 struct perf_event_groups *groups, int cpu,
3540                                 int (*func)(struct perf_event *, void *),
3541                                 void *data)
3542 {
3543 #ifdef CONFIG_CGROUP_PERF
3544         struct cgroup_subsys_state *css = NULL;
3545 #endif
3546         /* Space for per CPU and/or any CPU event iterators. */
3547         struct perf_event *itrs[2];
3548         struct min_heap event_heap;
3549         struct perf_event **evt;
3550         int ret;
3551
3552         if (cpuctx) {
3553                 event_heap = (struct min_heap){
3554                         .data = cpuctx->heap,
3555                         .nr = 0,
3556                         .size = cpuctx->heap_size,
3557                 };
3558
3559                 lockdep_assert_held(&cpuctx->ctx.lock);
3560
3561 #ifdef CONFIG_CGROUP_PERF
3562                 if (cpuctx->cgrp)
3563                         css = &cpuctx->cgrp->css;
3564 #endif
3565         } else {
3566                 event_heap = (struct min_heap){
3567                         .data = itrs,
3568                         .nr = 0,
3569                         .size = ARRAY_SIZE(itrs),
3570                 };
3571                 /* Events not within a CPU context may be on any CPU. */
3572                 __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));
3573         }
3574         evt = event_heap.data;
3575
3576         __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));
3577
3578 #ifdef CONFIG_CGROUP_PERF
3579         for (; css; css = css->parent)
3580                 __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));
3581 #endif
3582
3583         min_heapify_all(&event_heap, &perf_min_heap);
3584
3585         while (event_heap.nr) {
3586                 ret = func(*evt, data);
3587                 if (ret)
3588                         return ret;
3589
3590                 *evt = perf_event_groups_next(*evt);
3591                 if (*evt)
3592                         min_heapify(&event_heap, 0, &perf_min_heap);
3593                 else
3594                         min_heap_pop(&event_heap, &perf_min_heap);
3595         }
3596
3597         return 0;
3598 }
3599
3600 static int merge_sched_in(struct perf_event *event, void *data)
3601 {
3602         struct perf_event_context *ctx = event->ctx;
3603         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3604         int *can_add_hw = data;
3605
3606         if (event->state <= PERF_EVENT_STATE_OFF)
3607                 return 0;
3608
3609         if (!event_filter_match(event))
3610                 return 0;
3611
3612         if (group_can_go_on(event, cpuctx, *can_add_hw)) {
3613                 if (!group_sched_in(event, cpuctx, ctx))
3614                         list_add_tail(&event->active_list, get_event_list(event));
3615         }
3616
3617         if (event->state == PERF_EVENT_STATE_INACTIVE) {
3618                 if (event->attr.pinned)
3619                         perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
3620
3621                 *can_add_hw = 0;
3622                 ctx->rotate_necessary = 1;
3623         }
3624
3625         return 0;
3626 }
3627
3628 static void
3629 ctx_pinned_sched_in(struct perf_event_context *ctx,
3630                     struct perf_cpu_context *cpuctx)
3631 {
3632         int can_add_hw = 1;
3633
3634         if (ctx != &cpuctx->ctx)
3635                 cpuctx = NULL;
3636
3637         visit_groups_merge(cpuctx, &ctx->pinned_groups,
3638                            smp_processor_id(),
3639                            merge_sched_in, &can_add_hw);
3640 }
3641
3642 static void
3643 ctx_flexible_sched_in(struct perf_event_context *ctx,
3644                       struct perf_cpu_context *cpuctx)
3645 {
3646         int can_add_hw = 1;
3647
3648         if (ctx != &cpuctx->ctx)
3649                 cpuctx = NULL;
3650
3651         visit_groups_merge(cpuctx, &ctx->flexible_groups,
3652                            smp_processor_id(),
3653                            merge_sched_in, &can_add_hw);
3654 }
3655
3656 static void
3657 ctx_sched_in(struct perf_event_context *ctx,
3658              struct perf_cpu_context *cpuctx,
3659              enum event_type_t event_type,
3660              struct task_struct *task)
3661 {
3662         int is_active = ctx->is_active;
3663         u64 now;
3664
3665         lockdep_assert_held(&ctx->lock);
3666
3667         if (likely(!ctx->nr_events))
3668                 return;
3669
3670         ctx->is_active |= (event_type | EVENT_TIME);
3671         if (ctx->task) {
3672                 if (!is_active)
3673                         cpuctx->task_ctx = ctx;
3674                 else
3675                         WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3676         }
3677
3678         is_active ^= ctx->is_active; /* changed bits */
3679
3680         if (is_active & EVENT_TIME) {
3681                 /* start ctx time */
3682                 now = perf_clock();
3683                 ctx->timestamp = now;
3684                 perf_cgroup_set_timestamp(task, ctx);
3685         }
3686
3687         /*
3688          * First go through the list and put on any pinned groups
3689          * in order to give them the best chance of going on.
3690          */
3691         if (is_active & EVENT_PINNED)
3692                 ctx_pinned_sched_in(ctx, cpuctx);
3693
3694         /* Then walk through the lower prio flexible groups */
3695         if (is_active & EVENT_FLEXIBLE)
3696                 ctx_flexible_sched_in(ctx, cpuctx);
3697 }
3698
3699 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
3700                              enum event_type_t event_type,
3701                              struct task_struct *task)
3702 {
3703         struct perf_event_context *ctx = &cpuctx->ctx;
3704
3705         ctx_sched_in(ctx, cpuctx, event_type, task);
3706 }
3707
3708 static void perf_event_context_sched_in(struct perf_event_context *ctx,
3709                                         struct task_struct *task)
3710 {
3711         struct perf_cpu_context *cpuctx;
3712
3713         cpuctx = __get_cpu_context(ctx);
3714         if (cpuctx->task_ctx == ctx)
3715                 return;
3716
3717         perf_ctx_lock(cpuctx, ctx);
3718         /*
3719          * We must check ctx->nr_events while holding ctx->lock, such
3720          * that we serialize against perf_install_in_context().
3721          */
3722         if (!ctx->nr_events)
3723                 goto unlock;
3724
3725         perf_pmu_disable(ctx->pmu);
3726         /*
3727          * We want to keep the following priority order:
3728          * cpu pinned (that don't need to move), task pinned,
3729          * cpu flexible, task flexible.
3730          *
3731          * However, if task's ctx is not carrying any pinned
3732          * events, no need to flip the cpuctx's events around.
3733          */
3734         if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
3735                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3736         perf_event_sched_in(cpuctx, ctx, task);
3737         perf_pmu_enable(ctx->pmu);
3738
3739 unlock:
3740         perf_ctx_unlock(cpuctx, ctx);
3741 }
3742
3743 /*
3744  * Called from scheduler to add the events of the current task
3745  * with interrupts disabled.
3746  *
3747  * We restore the event value and then enable it.
3748  *
3749  * This does not protect us against NMI, but enable()
3750  * sets the enabled bit in the control field of event _before_
3751  * accessing the event control register. If a NMI hits, then it will
3752  * keep the event running.
3753  */
3754 void __perf_event_task_sched_in(struct task_struct *prev,
3755                                 struct task_struct *task)
3756 {
3757         struct perf_event_context *ctx;
3758         int ctxn;
3759
3760         /*
3761          * If cgroup events exist on this CPU, then we need to check if we have
3762          * to switch in PMU state; cgroup event are system-wide mode only.
3763          *
3764          * Since cgroup events are CPU events, we must schedule these in before
3765          * we schedule in the task events.
3766          */
3767         if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3768                 perf_cgroup_sched_in(prev, task);
3769
3770         for_each_task_context_nr(ctxn) {
3771                 ctx = task->perf_event_ctxp[ctxn];
3772                 if (likely(!ctx))
3773                         continue;
3774
3775                 perf_event_context_sched_in(ctx, task);
3776         }
3777
3778         if (atomic_read(&nr_switch_events))
3779                 perf_event_switch(task, prev, true);
3780
3781         if (__this_cpu_read(perf_sched_cb_usages))
3782                 perf_pmu_sched_task(prev, task, true);
3783 }
3784
3785 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
3786 {
3787         u64 frequency = event->attr.sample_freq;
3788         u64 sec = NSEC_PER_SEC;
3789         u64 divisor, dividend;
3790
3791         int count_fls, nsec_fls, frequency_fls, sec_fls;
3792
3793         count_fls = fls64(count);
3794         nsec_fls = fls64(nsec);
3795         frequency_fls = fls64(frequency);
3796         sec_fls = 30;
3797
3798         /*
3799          * We got @count in @nsec, with a target of sample_freq HZ
3800          * the target period becomes:
3801          *
3802          *             @count * 10^9
3803          * period = -------------------
3804          *          @nsec * sample_freq
3805          *
3806          */
3807
3808         /*
3809          * Reduce accuracy by one bit such that @a and @b converge
3810          * to a similar magnitude.
3811          */
3812 #define REDUCE_FLS(a, b)                \
3813 do {                                    \
3814         if (a##_fls > b##_fls) {        \
3815                 a >>= 1;                \
3816                 a##_fls--;              \
3817         } else {                        \
3818                 b >>= 1;                \
3819                 b##_fls--;              \
3820         }                               \
3821 } while (0)
3822
3823         /*
3824          * Reduce accuracy until either term fits in a u64, then proceed with
3825          * the other, so that finally we can do a u64/u64 division.
3826          */
3827         while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
3828                 REDUCE_FLS(nsec, frequency);
3829                 REDUCE_FLS(sec, count);
3830         }
3831
3832         if (count_fls + sec_fls > 64) {
3833                 divisor = nsec * frequency;
3834
3835                 while (count_fls + sec_fls > 64) {
3836                         REDUCE_FLS(count, sec);
3837                         divisor >>= 1;
3838                 }
3839
3840                 dividend = count * sec;
3841         } else {
3842                 dividend = count * sec;
3843
3844                 while (nsec_fls + frequency_fls > 64) {
3845                         REDUCE_FLS(nsec, frequency);
3846                         dividend >>= 1;
3847                 }
3848
3849                 divisor = nsec * frequency;
3850         }
3851
3852         if (!divisor)
3853                 return dividend;
3854
3855         return div64_u64(dividend, divisor);
3856 }
3857
3858 static DEFINE_PER_CPU(int, perf_throttled_count);
3859 static DEFINE_PER_CPU(u64, perf_throttled_seq);
3860
3861 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
3862 {
3863         struct hw_perf_event *hwc = &event->hw;
3864         s64 period, sample_period;
3865         s64 delta;
3866
3867         period = perf_calculate_period(event, nsec, count);
3868
3869         delta = (s64)(period - hwc->sample_period);
3870         delta = (delta + 7) / 8; /* low pass filter */
3871
3872         sample_period = hwc->sample_period + delta;
3873
3874         if (!sample_period)
3875                 sample_period = 1;
3876
3877         hwc->sample_period = sample_period;
3878
3879         if (local64_read(&hwc->period_left) > 8*sample_period) {
3880                 if (disable)
3881                         event->pmu->stop(event, PERF_EF_UPDATE);
3882
3883                 local64_set(&hwc->period_left, 0);
3884
3885                 if (disable)
3886                         event->pmu->start(event, PERF_EF_RELOAD);
3887         }
3888 }
3889
3890 /*
3891  * combine freq adjustment with unthrottling to avoid two passes over the
3892  * events. At the same time, make sure, having freq events does not change
3893  * the rate of unthrottling as that would introduce bias.
3894  */
3895 static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
3896                                            int needs_unthr)
3897 {
3898         struct perf_event *event;
3899         struct hw_perf_event *hwc;
3900         u64 now, period = TICK_NSEC;
3901         s64 delta;
3902
3903         /*
3904          * only need to iterate over all events iff:
3905          * - context have events in frequency mode (needs freq adjust)
3906          * - there are events to unthrottle on this cpu
3907          */
3908         if (!(ctx->nr_freq || needs_unthr))
3909                 return;
3910
3911         raw_spin_lock(&ctx->lock);
3912         perf_pmu_disable(ctx->pmu);
3913
3914         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3915                 if (event->state != PERF_EVENT_STATE_ACTIVE)
3916                         continue;
3917
3918                 if (!event_filter_match(event))
3919                         continue;
3920
3921                 perf_pmu_disable(event->pmu);
3922
3923                 hwc = &event->hw;
3924
3925                 if (hwc->interrupts == MAX_INTERRUPTS) {
3926                         hwc->interrupts = 0;
3927                         perf_log_throttle(event, 1);
3928                         event->pmu->start(event, 0);
3929                 }
3930
3931                 if (!event->attr.freq || !event->attr.sample_freq)
3932                         goto next;
3933
3934                 /*
3935                  * stop the event and update event->count
3936                  */
3937                 event->pmu->stop(event, PERF_EF_UPDATE);
3938
3939                 now = local64_read(&event->count);
3940                 delta = now - hwc->freq_count_stamp;
3941                 hwc->freq_count_stamp = now;
3942
3943                 /*
3944                  * restart the event
3945                  * reload only if value has changed
3946                  * we have stopped the event so tell that
3947                  * to perf_adjust_period() to avoid stopping it
3948                  * twice.
3949                  */
3950                 if (delta > 0)
3951                         perf_adjust_period(event, period, delta, false);
3952
3953                 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
3954         next:
3955                 perf_pmu_enable(event->pmu);
3956         }
3957
3958         perf_pmu_enable(ctx->pmu);
3959         raw_spin_unlock(&ctx->lock);
3960 }
3961
3962 /*
3963  * Move @event to the tail of the @ctx's elegible events.
3964  */
3965 static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
3966 {
3967         /*
3968          * Rotate the first entry last of non-pinned groups. Rotation might be
3969          * disabled by the inheritance code.
3970          */
3971         if (ctx->rotate_disable)
3972                 return;
3973
3974         perf_event_groups_delete(&ctx->flexible_groups, event);
3975         perf_event_groups_insert(&ctx->flexible_groups, event);
3976 }
3977
3978 /* pick an event from the flexible_groups to rotate */
3979 static inline struct perf_event *
3980 ctx_event_to_rotate(struct perf_event_context *ctx)
3981 {
3982         struct perf_event *event;
3983
3984         /* pick the first active flexible event */
3985         event = list_first_entry_or_null(&ctx->flexible_active,
3986                                          struct perf_event, active_list);
3987
3988         /* if no active flexible event, pick the first event */
3989         if (!event) {
3990                 event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
3991                                       typeof(*event), group_node);
3992         }
3993
3994         /*
3995          * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
3996          * finds there are unschedulable events, it will set it again.
3997          */
3998         ctx->rotate_necessary = 0;
3999
4000         return event;
4001 }
4002
4003 static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
4004 {
4005         struct perf_event *cpu_event = NULL, *task_event = NULL;
4006         struct perf_event_context *task_ctx = NULL;
4007         int cpu_rotate, task_rotate;
4008
4009         /*
4010          * Since we run this from IRQ context, nobody can install new
4011          * events, thus the event count values are stable.
4012          */
4013
4014         cpu_rotate = cpuctx->ctx.rotate_necessary;
4015         task_ctx = cpuctx->task_ctx;
4016         task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
4017
4018         if (!(cpu_rotate || task_rotate))
4019                 return false;
4020
4021         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
4022         perf_pmu_disable(cpuctx->ctx.pmu);
4023
4024         if (task_rotate)
4025                 task_event = ctx_event_to_rotate(task_ctx);
4026         if (cpu_rotate)
4027                 cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
4028
4029         /*
4030          * As per the order given at ctx_resched() first 'pop' task flexible
4031          * and then, if needed CPU flexible.
4032          */
4033         if (task_event || (task_ctx && cpu_event))
4034                 ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
4035         if (cpu_event)
4036                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
4037
4038         if (task_event)
4039                 rotate_ctx(task_ctx, task_event);
4040         if (cpu_event)
4041                 rotate_ctx(&cpuctx->ctx, cpu_event);
4042
4043         perf_event_sched_in(cpuctx, task_ctx, current);
4044
4045         perf_pmu_enable(cpuctx->ctx.pmu);
4046         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
4047
4048         return true;
4049 }
4050
4051 void perf_event_task_tick(void)
4052 {
4053         struct list_head *head = this_cpu_ptr(&active_ctx_list);
4054         struct perf_event_context *ctx, *tmp;
4055         int throttled;
4056
4057         lockdep_assert_irqs_disabled();
4058
4059         __this_cpu_inc(perf_throttled_seq);
4060         throttled = __this_cpu_xchg(perf_throttled_count, 0);
4061         tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
4062
4063         list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
4064                 perf_adjust_freq_unthr_context(ctx, throttled);
4065 }
4066
4067 static int event_enable_on_exec(struct perf_event *event,
4068                                 struct perf_event_context *ctx)
4069 {
4070         if (!event->attr.enable_on_exec)
4071                 return 0;
4072
4073         event->attr.enable_on_exec = 0;
4074         if (event->state >= PERF_EVENT_STATE_INACTIVE)
4075                 return 0;
4076
4077         perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
4078
4079         return 1;
4080 }
4081
4082 /*
4083  * Enable all of a task's events that have been marked enable-on-exec.
4084  * This expects task == current.
4085  */
4086 static void perf_event_enable_on_exec(int ctxn)
4087 {
4088         struct perf_event_context *ctx, *clone_ctx = NULL;
4089         enum event_type_t event_type = 0;
4090         struct perf_cpu_context *cpuctx;
4091         struct perf_event *event;
4092         unsigned long flags;
4093         int enabled = 0;
4094
4095         local_irq_save(flags);
4096         ctx = current->perf_event_ctxp[ctxn];
4097         if (!ctx || !ctx->nr_events)
4098                 goto out;
4099
4100         cpuctx = __get_cpu_context(ctx);
4101         perf_ctx_lock(cpuctx, ctx);
4102         ctx_sched_out(ctx, cpuctx, EVENT_TIME);
4103         list_for_each_entry(event, &ctx->event_list, event_entry) {
4104                 enabled |= event_enable_on_exec(event, ctx);
4105                 event_type |= get_event_type(event);
4106         }
4107
4108         /*
4109          * Unclone and reschedule this context if we enabled any event.
4110          */
4111         if (enabled) {
4112                 clone_ctx = unclone_ctx(ctx);
4113                 ctx_resched(cpuctx, ctx, event_type);
4114         } else {
4115                 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
4116         }
4117         perf_ctx_unlock(cpuctx, ctx);
4118
4119 out:
4120         local_irq_restore(flags);
4121
4122         if (clone_ctx)
4123                 put_ctx(clone_ctx);
4124 }
4125
4126 struct perf_read_data {
4127         struct perf_event *event;
4128         bool group;
4129         int ret;
4130 };
4131
4132 static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
4133 {
4134         u16 local_pkg, event_pkg;
4135
4136         if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
4137                 int local_cpu = smp_processor_id();
4138
4139                 event_pkg = topology_physical_package_id(event_cpu);
4140                 local_pkg = topology_physical_package_id(local_cpu);
4141
4142                 if (event_pkg == local_pkg)
4143                         return local_cpu;
4144         }
4145
4146         return event_cpu;
4147 }
4148
4149 /*
4150  * Cross CPU call to read the hardware event
4151  */
4152 static void __perf_event_read(void *info)
4153 {
4154         struct perf_read_data *data = info;
4155         struct perf_event *sub, *event = data->event;
4156         struct perf_event_context *ctx = event->ctx;
4157         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
4158         struct pmu *pmu = event->pmu;
4159
4160         /*
4161          * If this is a task context, we need to check whether it is
4162          * the current task context of this cpu.  If not it has been
4163          * scheduled out before the smp call arrived.  In that case
4164          * event->count would have been updated to a recent sample
4165          * when the event was scheduled out.
4166          */
4167         if (ctx->task && cpuctx->task_ctx != ctx)
4168                 return;
4169
4170         raw_spin_lock(&ctx->lock);
4171         if (ctx->is_active & EVENT_TIME) {
4172                 update_context_time(ctx);
4173                 update_cgrp_time_from_event(event);
4174         }
4175
4176         perf_event_update_time(event);
4177         if (data->group)
4178                 perf_event_update_sibling_time(event);
4179
4180         if (event->state != PERF_EVENT_STATE_ACTIVE)
4181                 goto unlock;
4182
4183         if (!data->group) {
4184                 pmu->read(event);
4185                 data->ret = 0;
4186                 goto unlock;
4187         }
4188
4189         pmu->start_txn(pmu, PERF_PMU_TXN_READ);
4190
4191         pmu->read(event);
4192
4193         for_each_sibling_event(sub, event) {
4194                 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
4195                         /*
4196                          * Use sibling's PMU rather than @event's since
4197                          * sibling could be on different (eg: software) PMU.
4198                          */
4199                         sub->pmu->read(sub);
4200                 }
4201         }
4202
4203         data->ret = pmu->commit_txn(pmu);
4204
4205 unlock:
4206         raw_spin_unlock(&ctx->lock);
4207 }
4208
4209 static inline u64 perf_event_count(struct perf_event *event)
4210 {
4211         return local64_read(&event->count) + atomic64_read(&event->child_count);
4212 }
4213
4214 /*
4215  * NMI-safe method to read a local event, that is an event that
4216  * is:
4217  *   - either for the current task, or for this CPU
4218  *   - does not have inherit set, for inherited task events
4219  *     will not be local and we cannot read them atomically
4220  *   - must not have a pmu::count method
4221  */
4222 int perf_event_read_local(struct perf_event *event, u64 *value,
4223                           u64 *enabled, u64 *running)
4224 {
4225         unsigned long flags;
4226         int ret = 0;
4227
4228         /*
4229          * Disabling interrupts avoids all counter scheduling (context
4230          * switches, timer based rotation and IPIs).
4231          */
4232         local_irq_save(flags);
4233
4234         /*
4235          * It must not be an event with inherit set, we cannot read
4236          * all child counters from atomic context.
4237          */
4238         if (event->attr.inherit) {
4239                 ret = -EOPNOTSUPP;
4240                 goto out;
4241         }
4242
4243         /* If this is a per-task event, it must be for current */
4244         if ((event->attach_state & PERF_ATTACH_TASK) &&
4245             event->hw.target != current) {
4246                 ret = -EINVAL;
4247                 goto out;
4248         }
4249
4250         /* If this is a per-CPU event, it must be for this CPU */
4251         if (!(event->attach_state & PERF_ATTACH_TASK) &&
4252             event->cpu != smp_processor_id()) {
4253                 ret = -EINVAL;
4254                 goto out;
4255         }
4256
4257         /* If this is a pinned event it must be running on this CPU */
4258         if (event->attr.pinned && event->oncpu != smp_processor_id()) {
4259                 ret = -EBUSY;
4260                 goto out;
4261         }
4262
4263         /*
4264          * If the event is currently on this CPU, its either a per-task event,
4265          * or local to this CPU. Furthermore it means its ACTIVE (otherwise
4266          * oncpu == -1).
4267          */
4268         if (event->oncpu == smp_processor_id())
4269                 event->pmu->read(event);
4270
4271         *value = local64_read(&event->count);
4272         if (enabled || running) {
4273                 u64 now = event->shadow_ctx_time + perf_clock();
4274                 u64 __enabled, __running;
4275
4276                 __perf_update_times(event, now, &__enabled, &__running);
4277                 if (enabled)
4278                         *enabled = __enabled;
4279                 if (running)
4280                         *running = __running;
4281         }
4282 out:
4283         local_irq_restore(flags);
4284
4285         return ret;
4286 }
4287
4288 static int perf_event_read(struct perf_event *event, bool group)
4289 {
4290         enum perf_event_state state = READ_ONCE(event->state);
4291         int event_cpu, ret = 0;
4292
4293         /*
4294          * If event is enabled and currently active on a CPU, update the
4295          * value in the event structure:
4296          */
4297 again:
4298         if (state == PERF_EVENT_STATE_ACTIVE) {
4299                 struct perf_read_data data;
4300
4301                 /*
4302                  * Orders the ->state and ->oncpu loads such that if we see
4303                  * ACTIVE we must also see the right ->oncpu.
4304                  *
4305                  * Matches the smp_wmb() from event_sched_in().
4306                  */
4307                 smp_rmb();
4308
4309                 event_cpu = READ_ONCE(event->oncpu);
4310                 if ((unsigned)event_cpu >= nr_cpu_ids)
4311                         return 0;
4312
4313                 data = (struct perf_read_data){
4314                         .event = event,
4315                         .group = group,
4316                         .ret = 0,
4317                 };
4318
4319                 preempt_disable();
4320                 event_cpu = __perf_event_read_cpu(event, event_cpu);
4321
4322                 /*
4323                  * Purposely ignore the smp_call_function_single() return
4324                  * value.
4325                  *
4326                  * If event_cpu isn't a valid CPU it means the event got
4327                  * scheduled out and that will have updated the event count.
4328                  *
4329                  * Therefore, either way, we'll have an up-to-date event count
4330                  * after this.
4331                  */
4332                 (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
4333                 preempt_enable();
4334                 ret = data.ret;
4335
4336         } else if (state == PERF_EVENT_STATE_INACTIVE) {
4337                 struct perf_event_context *ctx = event->ctx;
4338                 unsigned long flags;
4339
4340                 raw_spin_lock_irqsave(&ctx->lock, flags);
4341                 state = event->state;
4342                 if (state != PERF_EVENT_STATE_INACTIVE) {
4343                         raw_spin_unlock_irqrestore(&ctx->lock, flags);
4344                         goto again;
4345                 }
4346
4347                 /*
4348                  * May read while context is not active (e.g., thread is
4349                  * blocked), in that case we cannot update context time
4350                  */
4351                 if (ctx->is_active & EVENT_TIME) {
4352                         update_context_time(ctx);
4353                         update_cgrp_time_from_event(event);
4354                 }
4355
4356                 perf_event_update_time(event);
4357                 if (group)
4358                         perf_event_update_sibling_time(event);
4359                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4360         }
4361
4362         return ret;
4363 }
4364
4365 /*
4366  * Initialize the perf_event context in a task_struct:
4367  */
4368 static void __perf_event_init_context(struct perf_event_context *ctx)
4369 {
4370         raw_spin_lock_init(&ctx->lock);
4371         mutex_init(&ctx->mutex);
4372         INIT_LIST_HEAD(&ctx->active_ctx_list);
4373         perf_event_groups_init(&ctx->pinned_groups);
4374         perf_event_groups_init(&ctx->flexible_groups);
4375         INIT_LIST_HEAD(&ctx->event_list);
4376         INIT_LIST_HEAD(&ctx->pinned_active);
4377         INIT_LIST_HEAD(&ctx->flexible_active);
4378         refcount_set(&ctx->refcount, 1);
4379 }
4380
4381 static struct perf_event_context *
4382 alloc_perf_context(struct pmu *pmu, struct task_struct *task)
4383 {
4384         struct perf_event_context *ctx;
4385
4386         ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
4387         if (!ctx)
4388                 return NULL;
4389
4390         __perf_event_init_context(ctx);
4391         if (task)
4392                 ctx->task = get_task_struct(task);
4393         ctx->pmu = pmu;
4394
4395         return ctx;
4396 }
4397
4398 static struct task_struct *
4399 find_lively_task_by_vpid(pid_t vpid)
4400 {
4401         struct task_struct *task;
4402
4403         rcu_read_lock();
4404         if (!vpid)
4405                 task = current;
4406         else
4407                 task = find_task_by_vpid(vpid);
4408         if (task)
4409                 get_task_struct(task);
4410         rcu_read_unlock();
4411
4412         if (!task)
4413                 return ERR_PTR(-ESRCH);
4414
4415         return task;
4416 }
4417
4418 /*
4419  * Returns a matching context with refcount and pincount.
4420  */
4421 static struct perf_event_context *
4422 find_get_context(struct pmu *pmu, struct task_struct *task,
4423                 struct perf_event *event)
4424 {
4425         struct perf_event_context *ctx, *clone_ctx = NULL;
4426         struct perf_cpu_context *cpuctx;
4427         void *task_ctx_data = NULL;
4428         unsigned long flags;
4429         int ctxn, err;
4430         int cpu = event->cpu;
4431
4432         if (!task) {
4433                 /* Must be root to operate on a CPU event: */
4434                 err = perf_allow_cpu(&event->attr);
4435                 if (err)
4436                         return ERR_PTR(err);
4437
4438                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
4439                 ctx = &cpuctx->ctx;
4440                 get_ctx(ctx);
4441                 ++ctx->pin_count;
4442
4443                 return ctx;
4444         }
4445
4446         err = -EINVAL;
4447         ctxn = pmu->task_ctx_nr;
4448         if (ctxn < 0)
4449                 goto errout;
4450
4451         if (event->attach_state & PERF_ATTACH_TASK_DATA) {
4452                 task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
4453                 if (!task_ctx_data) {
4454                         err = -ENOMEM;
4455                         goto errout;
4456                 }
4457         }
4458
4459 retry:
4460         ctx = perf_lock_task_context(task, ctxn, &flags);
4461         if (ctx) {
4462                 clone_ctx = unclone_ctx(ctx);
4463                 ++ctx->pin_count;
4464
4465                 if (task_ctx_data && !ctx->task_ctx_data) {
4466                         ctx->task_ctx_data = task_ctx_data;
4467                         task_ctx_data = NULL;
4468                 }
4469                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4470
4471                 if (clone_ctx)
4472                         put_ctx(clone_ctx);
4473         } else {
4474                 ctx = alloc_perf_context(pmu, task);
4475                 err = -ENOMEM;
4476                 if (!ctx)
4477                         goto errout;
4478
4479                 if (task_ctx_data) {
4480                         ctx->task_ctx_data = task_ctx_data;
4481                         task_ctx_data = NULL;
4482                 }
4483
4484                 err = 0;
4485                 mutex_lock(&task->perf_event_mutex);
4486                 /*
4487                  * If it has already passed perf_event_exit_task().
4488                  * we must see PF_EXITING, it takes this mutex too.
4489                  */
4490                 if (task->flags & PF_EXITING)
4491                         err = -ESRCH;
4492                 else if (task->perf_event_ctxp[ctxn])
4493                         err = -EAGAIN;
4494                 else {
4495                         get_ctx(ctx);
4496                         ++ctx->pin_count;
4497                         rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
4498                 }
4499                 mutex_unlock(&task->perf_event_mutex);
4500
4501                 if (unlikely(err)) {
4502                         put_ctx(ctx);
4503
4504                         if (err == -EAGAIN)
4505                                 goto retry;
4506                         goto errout;
4507                 }
4508         }
4509
4510         kfree(task_ctx_data);
4511         return ctx;
4512
4513 errout:
4514         kfree(task_ctx_data);
4515         return ERR_PTR(err);
4516 }
4517
4518 static void perf_event_free_filter(struct perf_event *event);
4519 static void perf_event_free_bpf_prog(struct perf_event *event);
4520
4521 static void free_event_rcu(struct rcu_head *head)
4522 {
4523         struct perf_event *event;
4524
4525         event = container_of(head, struct perf_event, rcu_head);
4526         if (event->ns)
4527                 put_pid_ns(event->ns);
4528         perf_event_free_filter(event);
4529         kfree(event);
4530 }
4531
4532 static void ring_buffer_attach(struct perf_event *event,
4533                                struct perf_buffer *rb);
4534
4535 static void detach_sb_event(struct perf_event *event)
4536 {
4537         struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
4538
4539         raw_spin_lock(&pel->lock);
4540         list_del_rcu(&event->sb_list);
4541         raw_spin_unlock(&pel->lock);
4542 }
4543
4544 static bool is_sb_event(struct perf_event *event)
4545 {
4546         struct perf_event_attr *attr = &event->attr;
4547
4548         if (event->parent)
4549                 return false;
4550
4551         if (event->attach_state & PERF_ATTACH_TASK)
4552                 return false;
4553
4554         if (attr->mmap || attr->mmap_data || attr->mmap2 ||
4555             attr->comm || attr->comm_exec ||
4556             attr->task || attr->ksymbol ||
4557             attr->context_switch ||
4558             attr->bpf_event)
4559                 return true;
4560         return false;
4561 }
4562
4563 static void unaccount_pmu_sb_event(struct perf_event *event)
4564 {
4565         if (is_sb_event(event))
4566                 detach_sb_event(event);
4567 }
4568
4569 static void unaccount_event_cpu(struct perf_event *event, int cpu)
4570 {
4571         if (event->parent)
4572                 return;
4573
4574         if (is_cgroup_event(event))
4575                 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
4576 }
4577
4578 #ifdef CONFIG_NO_HZ_FULL
4579 static DEFINE_SPINLOCK(nr_freq_lock);
4580 #endif
4581
4582 static void unaccount_freq_event_nohz(void)
4583 {
4584 #ifdef CONFIG_NO_HZ_FULL
4585         spin_lock(&nr_freq_lock);
4586         if (atomic_dec_and_test(&nr_freq_events))
4587                 tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
4588         spin_unlock(&nr_freq_lock);
4589 #endif
4590 }
4591
4592 static void unaccount_freq_event(void)
4593 {
4594         if (tick_nohz_full_enabled())
4595                 unaccount_freq_event_nohz();
4596         else
4597                 atomic_dec(&nr_freq_events);
4598 }
4599
4600 static void unaccount_event(struct perf_event *event)
4601 {
4602         bool dec = false;
4603
4604         if (event->parent)
4605                 return;
4606
4607         if (event->attach_state & PERF_ATTACH_TASK)
4608                 dec = true;
4609         if (event->attr.mmap || event->attr.mmap_data)
4610                 atomic_dec(&nr_mmap_events);
4611         if (event->attr.comm)
4612                 atomic_dec(&nr_comm_events);
4613         if (event->attr.namespaces)
4614                 atomic_dec(&nr_namespaces_events);
4615         if (event->attr.cgroup)
4616                 atomic_dec(&nr_cgroup_events);
4617         if (event->attr.task)
4618                 atomic_dec(&nr_task_events);
4619         if (event->attr.freq)
4620                 unaccount_freq_event();
4621         if (event->attr.context_switch) {
4622                 dec = true;
4623                 atomic_dec(&nr_switch_events);
4624         }
4625         if (is_cgroup_event(event))
4626                 dec = true;
4627         if (has_branch_stack(event))
4628                 dec = true;
4629         if (event->attr.ksymbol)
4630                 atomic_dec(&nr_ksymbol_events);
4631         if (event->attr.bpf_event)
4632                 atomic_dec(&nr_bpf_events);
4633
4634         if (dec) {
4635                 if (!atomic_add_unless(&perf_sched_count, -1, 1))
4636                         schedule_delayed_work(&perf_sched_work, HZ);
4637         }
4638
4639         unaccount_event_cpu(event, event->cpu);
4640
4641         unaccount_pmu_sb_event(event);
4642 }
4643
4644 static void perf_sched_delayed(struct work_struct *work)
4645 {
4646         mutex_lock(&perf_sched_mutex);
4647         if (atomic_dec_and_test(&perf_sched_count))
4648                 static_branch_disable(&perf_sched_events);
4649         mutex_unlock(&perf_sched_mutex);
4650 }
4651
4652 /*
4653  * The following implement mutual exclusion of events on "exclusive" pmus
4654  * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
4655  * at a time, so we disallow creating events that might conflict, namely:
4656  *
4657  *  1) cpu-wide events in the presence of per-task events,
4658  *  2) per-task events in the presence of cpu-wide events,
4659  *  3) two matching events on the same context.
4660  *
4661  * The former two cases are handled in the allocation path (perf_event_alloc(),
4662  * _free_event()), the latter -- before the first perf_install_in_context().
4663  */
4664 static int exclusive_event_init(struct perf_event *event)
4665 {
4666         struct pmu *pmu = event->pmu;
4667
4668         if (!is_exclusive_pmu(pmu))
4669                 return 0;
4670
4671         /*
4672          * Prevent co-existence of per-task and cpu-wide events on the
4673          * same exclusive pmu.
4674          *
4675          * Negative pmu::exclusive_cnt means there are cpu-wide
4676          * events on this "exclusive" pmu, positive means there are
4677          * per-task events.
4678          *
4679          * Since this is called in perf_event_alloc() path, event::ctx
4680          * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
4681          * to mean "per-task event", because unlike other attach states it
4682          * never gets cleared.
4683          */
4684         if (event->attach_state & PERF_ATTACH_TASK) {
4685                 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
4686                         return -EBUSY;
4687         } else {
4688                 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
4689                         return -EBUSY;
4690         }
4691
4692         return 0;
4693 }
4694
4695 static void exclusive_event_destroy(struct perf_event *event)
4696 {
4697         struct pmu *pmu = event->pmu;
4698
4699         if (!is_exclusive_pmu(pmu))
4700                 return;
4701
4702         /* see comment in exclusive_event_init() */
4703         if (event->attach_state & PERF_ATTACH_TASK)
4704                 atomic_dec(&pmu->exclusive_cnt);
4705         else
4706                 atomic_inc(&pmu->exclusive_cnt);
4707 }
4708
4709 static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
4710 {
4711         if ((e1->pmu == e2->pmu) &&
4712             (e1->cpu == e2->cpu ||
4713              e1->cpu == -1 ||
4714              e2->cpu == -1))
4715                 return true;
4716         return false;
4717 }
4718
4719 static bool exclusive_event_installable(struct perf_event *event,
4720                                         struct perf_event_context *ctx)
4721 {
4722         struct perf_event *iter_event;
4723         struct pmu *pmu = event->pmu;
4724
4725         lockdep_assert_held(&ctx->mutex);
4726
4727         if (!is_exclusive_pmu(pmu))
4728                 return true;
4729
4730         list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
4731                 if (exclusive_event_match(iter_event, event))
4732                         return false;
4733         }
4734
4735         return true;
4736 }
4737
4738 static void perf_addr_filters_splice(struct perf_event *event,
4739                                        struct list_head *head);
4740
4741 static void _free_event(struct perf_event *event)
4742 {
4743         irq_work_sync(&event->pending);
4744
4745         unaccount_event(event);
4746
4747         security_perf_event_free(event);
4748
4749         if (event->rb) {
4750                 /*
4751                  * Can happen when we close an event with re-directed output.
4752                  *
4753                  * Since we have a 0 refcount, perf_mmap_close() will skip
4754                  * over us; possibly making our ring_buffer_put() the last.
4755                  */
4756                 mutex_lock(&event->mmap_mutex);
4757                 ring_buffer_attach(event, NULL);
4758                 mutex_unlock(&event->mmap_mutex);
4759         }
4760
4761         if (is_cgroup_event(event))
4762                 perf_detach_cgroup(event);
4763
4764         if (!event->parent) {
4765                 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
4766                         put_callchain_buffers();
4767         }
4768
4769         perf_event_free_bpf_prog(event);
4770         perf_addr_filters_splice(event, NULL);
4771         kfree(event->addr_filter_ranges);
4772
4773         if (event->destroy)
4774                 event->destroy(event);
4775
4776         /*
4777          * Must be after ->destroy(), due to uprobe_perf_close() using
4778          * hw.target.
4779          */
4780         if (event->hw.target)
4781                 put_task_struct(event->hw.target);
4782
4783         /*
4784          * perf_event_free_task() relies on put_ctx() being 'last', in particular
4785          * all task references must be cleaned up.
4786          */
4787         if (event->ctx)
4788                 put_ctx(event->ctx);
4789
4790         exclusive_event_destroy(event);
4791         module_put(event->pmu->module);
4792
4793         call_rcu(&event->rcu_head, free_event_rcu);
4794 }
4795
4796 /*
4797  * Used to free events which have a known refcount of 1, such as in error paths
4798  * where the event isn't exposed yet and inherited events.
4799  */
4800 static void free_event(struct perf_event *event)
4801 {
4802         if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
4803                                 "unexpected event refcount: %ld; ptr=%p\n",
4804                                 atomic_long_read(&event->refcount), event)) {
4805                 /* leak to avoid use-after-free */
4806                 return;
4807         }
4808
4809         _free_event(event);
4810 }
4811
4812 /*
4813  * Remove user event from the owner task.
4814  */
4815 static void perf_remove_from_owner(struct perf_event *event)
4816 {
4817         struct task_struct *owner;
4818
4819         rcu_read_lock();
4820         /*
4821          * Matches the smp_store_release() in perf_event_exit_task(). If we
4822          * observe !owner it means the list deletion is complete and we can
4823          * indeed free this event, otherwise we need to serialize on
4824          * owner->perf_event_mutex.
4825          */
4826         owner = READ_ONCE(event->owner);
4827         if (owner) {
4828                 /*
4829                  * Since delayed_put_task_struct() also drops the last
4830                  * task reference we can safely take a new reference
4831                  * while holding the rcu_read_lock().
4832                  */
4833                 get_task_struct(owner);
4834         }
4835         rcu_read_unlock();
4836
4837         if (owner) {
4838                 /*
4839                  * If we're here through perf_event_exit_task() we're already
4840                  * holding ctx->mutex which would be an inversion wrt. the
4841                  * normal lock order.
4842                  *
4843                  * However we can safely take this lock because its the child
4844                  * ctx->mutex.
4845                  */
4846                 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
4847
4848                 /*
4849                  * We have to re-check the event->owner field, if it is cleared
4850                  * we raced with perf_event_exit_task(), acquiring the mutex
4851                  * ensured they're done, and we can proceed with freeing the
4852                  * event.
4853                  */
4854                 if (event->owner) {
4855                         list_del_init(&event->owner_entry);
4856                         smp_store_release(&event->owner, NULL);
4857                 }
4858                 mutex_unlock(&owner->perf_event_mutex);
4859                 put_task_struct(owner);
4860         }
4861 }
4862
4863 static void put_event(struct perf_event *event)
4864 {
4865         if (!atomic_long_dec_and_test(&event->refcount))
4866                 return;
4867
4868         _free_event(event);
4869 }
4870
4871 /*
4872  * Kill an event dead; while event:refcount will preserve the event
4873  * object, it will not preserve its functionality. Once the last 'user'
4874  * gives up the object, we'll destroy the thing.
4875  */
4876 int perf_event_release_kernel(struct perf_event *event)
4877 {
4878         struct perf_event_context *ctx = event->ctx;
4879         struct perf_event *child, *tmp;
4880         LIST_HEAD(free_list);
4881
4882         /*
4883          * If we got here through err_file: fput(event_file); we will not have
4884          * attached to a context yet.
4885          */
4886         if (!ctx) {
4887                 WARN_ON_ONCE(event->attach_state &
4888                                 (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
4889                 goto no_ctx;
4890         }
4891
4892         if (!is_kernel_event(event))
4893                 perf_remove_from_owner(event);
4894
4895         ctx = perf_event_ctx_lock(event);
4896         WARN_ON_ONCE(ctx->parent_ctx);
4897         perf_remove_from_context(event, DETACH_GROUP);
4898
4899         raw_spin_lock_irq(&ctx->lock);
4900         /*
4901          * Mark this event as STATE_DEAD, there is no external reference to it
4902          * anymore.
4903          *
4904          * Anybody acquiring event->child_mutex after the below loop _must_
4905          * also see this, most importantly inherit_event() which will avoid
4906          * placing more children on the list.
4907          *
4908          * Thus this guarantees that we will in fact observe and kill _ALL_
4909          * child events.
4910          */
4911         event->state = PERF_EVENT_STATE_DEAD;
4912         raw_spin_unlock_irq(&ctx->lock);
4913
4914         perf_event_ctx_unlock(event, ctx);
4915
4916 again:
4917         mutex_lock(&event->child_mutex);
4918         list_for_each_entry(child, &event->child_list, child_list) {
4919
4920                 /*
4921                  * Cannot change, child events are not migrated, see the
4922                  * comment with perf_event_ctx_lock_nested().
4923                  */
4924                 ctx = READ_ONCE(child->ctx);
4925                 /*
4926                  * Since child_mutex nests inside ctx::mutex, we must jump
4927                  * through hoops. We start by grabbing a reference on the ctx.
4928                  *
4929                  * Since the event cannot get freed while we hold the
4930                  * child_mutex, the context must also exist and have a !0
4931                  * reference count.
4932                  */
4933                 get_ctx(ctx);
4934
4935                 /*
4936                  * Now that we have a ctx ref, we can drop child_mutex, and
4937                  * acquire ctx::mutex without fear of it going away. Then we
4938                  * can re-acquire child_mutex.
4939                  */
4940                 mutex_unlock(&event->child_mutex);
4941                 mutex_lock(&ctx->mutex);
4942                 mutex_lock(&event->child_mutex);
4943
4944                 /*
4945                  * Now that we hold ctx::mutex and child_mutex, revalidate our
4946                  * state, if child is still the first entry, it didn't get freed
4947                  * and we can continue doing so.
4948                  */
4949                 tmp = list_first_entry_or_null(&event->child_list,
4950                                                struct perf_event, child_list);
4951                 if (tmp == child) {
4952                         perf_remove_from_context(child, DETACH_GROUP);
4953                         list_move(&child->child_list, &free_list);
4954                         /*
4955                          * This matches the refcount bump in inherit_event();
4956                          * this can't be the last reference.
4957                          */
4958                         put_event(event);
4959                 }
4960
4961                 mutex_unlock(&event->child_mutex);
4962                 mutex_unlock(&ctx->mutex);
4963                 put_ctx(ctx);
4964                 goto again;
4965         }
4966         mutex_unlock(&event->child_mutex);
4967
4968         list_for_each_entry_safe(child, tmp, &free_list, child_list) {
4969                 void *var = &child->ctx->refcount;
4970
4971                 list_del(&child->child_list);
4972                 free_event(child);
4973
4974                 /*
4975                  * Wake any perf_event_free_task() waiting for this event to be
4976                  * freed.
4977                  */
4978                 smp_mb(); /* pairs with wait_var_event() */
4979                 wake_up_var(var);
4980         }
4981
4982 no_ctx:
4983         put_event(event); /* Must be the 'last' reference */
4984         return 0;
4985 }
4986 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
4987
4988 /*
4989  * Called when the last reference to the file is gone.
4990  */
4991 static int perf_release(struct inode *inode, struct file *file)
4992 {
4993         perf_event_release_kernel(file->private_data);
4994         return 0;
4995 }
4996
4997 static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
4998 {
4999         struct perf_event *child;
5000         u64 total = 0;
5001
5002         *enabled = 0;
5003         *running = 0;
5004
5005         mutex_lock(&event->child_mutex);
5006
5007         (void)perf_event_read(event, false);
5008         total += perf_event_count(event);
5009
5010         *enabled += event->total_time_enabled +
5011                         atomic64_read(&event->child_total_time_enabled);
5012         *running += event->total_time_running +
5013                         atomic64_read(&event->child_total_time_running);
5014
5015         list_for_each_entry(child, &event->child_list, child_list) {
5016                 (void)perf_event_read(child, false);
5017                 total += perf_event_count(child);
5018                 *enabled += child->total_time_enabled;
5019                 *running += child->total_time_running;
5020         }
5021         mutex_unlock(&event->child_mutex);
5022
5023         return total;
5024 }
5025
5026 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
5027 {
5028         struct perf_event_context *ctx;
5029         u64 count;
5030
5031         ctx = perf_event_ctx_lock(event);
5032         count = __perf_event_read_value(event, enabled, running);
5033         perf_event_ctx_unlock(event, ctx);
5034
5035         return count;
5036 }
5037 EXPORT_SYMBOL_GPL(perf_event_read_value);
5038
5039 static int __perf_read_group_add(struct perf_event *leader,
5040                                         u64 read_format, u64 *values)
5041 {
5042         struct perf_event_context *ctx = leader->ctx;
5043         struct perf_event *sub;
5044         unsigned long flags;
5045         int n = 1; /* skip @nr */
5046         int ret;
5047
5048         ret = perf_event_read(leader, true);
5049         if (ret)
5050                 return ret;
5051
5052         raw_spin_lock_irqsave(&ctx->lock, flags);
5053
5054         /*
5055          * Since we co-schedule groups, {enabled,running} times of siblings
5056          * will be identical to those of the leader, so we only publish one
5057          * set.
5058          */
5059         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
5060                 values[n++] += leader->total_time_enabled +
5061                         atomic64_read(&leader->child_total_time_enabled);
5062         }
5063
5064         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
5065                 values[n++] += leader->total_time_running +
5066                         atomic64_read(&leader->child_total_time_running);
5067         }
5068
5069         /*
5070          * Write {count,id} tuples for every sibling.
5071          */
5072         values[n++] += perf_event_count(leader);
5073         if (read_format & PERF_FORMAT_ID)
5074                 values[n++] = primary_event_id(leader);
5075
5076         for_each_sibling_event(sub, leader) {
5077                 values[n++] += perf_event_count(sub);
5078                 if (read_format & PERF_FORMAT_ID)
5079                         values[n++] = primary_event_id(sub);
5080         }
5081
5082         raw_spin_unlock_irqrestore(&ctx->lock, flags);
5083         return 0;
5084 }
5085
5086 static int perf_read_group(struct perf_event *event,
5087                                    u64 read_format, char __user *buf)
5088 {
5089         struct perf_event *leader = event->group_leader, *child;
5090         struct perf_event_context *ctx = leader->ctx;
5091         int ret;
5092         u64 *values;
5093
5094         lockdep_assert_held(&ctx->mutex);
5095
5096         values = kzalloc(event->read_size, GFP_KERNEL);
5097         if (!values)
5098                 return -ENOMEM;
5099
5100         values[0] = 1 + leader->nr_siblings;
5101
5102         /*
5103          * By locking the child_mutex of the leader we effectively
5104          * lock the child list of all siblings.. XXX explain how.
5105          */
5106         mutex_lock(&leader->child_mutex);
5107
5108         ret = __perf_read_group_add(leader, read_format, values);
5109         if (ret)
5110                 goto unlock;
5111
5112         list_for_each_entry(child, &leader->child_list, child_list) {
5113                 ret = __perf_read_group_add(child, read_format, values);
5114                 if (ret)
5115                         goto unlock;
5116         }
5117
5118         mutex_unlock(&leader->child_mutex);
5119
5120         ret = event->read_size;
5121         if (copy_to_user(buf, values, event->read_size))
5122                 ret = -EFAULT;
5123         goto out;
5124
5125 unlock:
5126         mutex_unlock(&leader->child_mutex);
5127 out:
5128         kfree(values);
5129         return ret;
5130 }
5131
5132 static int perf_read_one(struct perf_event *event,
5133                                  u64 read_format, char __user *buf)
5134 {
5135         u64 enabled, running;
5136         u64 values[4];
5137         int n = 0;
5138
5139         values[n++] = __perf_event_read_value(event, &enabled, &running);
5140         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
5141                 values[n++] = enabled;
5142         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
5143                 values[n++] = running;
5144         if (read_format & PERF_FORMAT_ID)
5145                 values[n++] = primary_event_id(event);
5146
5147         if (copy_to_user(buf, values, n * sizeof(u64)))
5148                 return -EFAULT;
5149
5150         return n * sizeof(u64);
5151 }
5152
5153 static bool is_event_hup(struct perf_event *event)
5154 {
5155         bool no_children;
5156
5157         if (event->state > PERF_EVENT_STATE_EXIT)
5158                 return false;
5159
5160         mutex_lock(&event->child_mutex);
5161         no_children = list_empty(&event->child_list);
5162         mutex_unlock(&event->child_mutex);
5163         return no_children;
5164 }
5165
5166 /*
5167  * Read the performance event - simple non blocking version for now
5168  */
5169 static ssize_t
5170 __perf_read(struct perf_event *event, char __user *buf, size_t count)
5171 {
5172         u64 read_format = event->attr.read_format;
5173         int ret;
5174
5175         /*
5176          * Return end-of-file for a read on an event that is in
5177          * error state (i.e. because it was pinned but it couldn't be
5178          * scheduled on to the CPU at some point).
5179          */
5180         if (event->state == PERF_EVENT_STATE_ERROR)
5181                 return 0;
5182
5183         if (count < event->read_size)
5184                 return -ENOSPC;
5185
5186         WARN_ON_ONCE(event->ctx->parent_ctx);
5187         if (read_format & PERF_FORMAT_GROUP)
5188                 ret = perf_read_group(event, read_format, buf);
5189         else
5190                 ret = perf_read_one(event, read_format, buf);
5191
5192         return ret;
5193 }
5194
5195 static ssize_t
5196 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
5197 {
5198         struct perf_event *event = file->private_data;
5199         struct perf_event_context *ctx;
5200         int ret;
5201
5202         ret = security_perf_event_read(event);
5203         if (ret)
5204                 return ret;
5205
5206         ctx = perf_event_ctx_lock(event);
5207         ret = __perf_read(event, buf, count);
5208         perf_event_ctx_unlock(event, ctx);
5209
5210         return ret;
5211 }
5212
5213 static __poll_t perf_poll(struct file *file, poll_table *wait)
5214 {
5215         struct perf_event *event = file->private_data;
5216         struct perf_buffer *rb;
5217         __poll_t events = EPOLLHUP;
5218
5219         poll_wait(file, &event->waitq, wait);
5220
5221         if (is_event_hup(event))
5222                 return events;
5223
5224         /*
5225          * Pin the event->rb by taking event->mmap_mutex; otherwise
5226          * perf_event_set_output() can swizzle our rb and make us miss wakeups.
5227          */
5228         mutex_lock(&event->mmap_mutex);
5229         rb = event->rb;
5230         if (rb)
5231                 events = atomic_xchg(&rb->poll, 0);
5232         mutex_unlock(&event->mmap_mutex);
5233         return events;
5234 }
5235
5236 static void _perf_event_reset(struct perf_event *event)
5237 {
5238         (void)perf_event_read(event, false);
5239         local64_set(&event->count, 0);
5240         perf_event_update_userpage(event);
5241 }
5242
5243 /* Assume it's not an event with inherit set. */
5244 u64 perf_event_pause(struct perf_event *event, bool reset)
5245 {
5246         struct perf_event_context *ctx;
5247         u64 count;
5248
5249         ctx = perf_event_ctx_lock(event);
5250         WARN_ON_ONCE(event->attr.inherit);
5251         _perf_event_disable(event);
5252         count = local64_read(&event->count);
5253         if (reset)
5254                 local64_set(&event->count, 0);
5255         perf_event_ctx_unlock(event, ctx);
5256
5257         return count;
5258 }
5259 EXPORT_SYMBOL_GPL(perf_event_pause);
5260
5261 /*
5262  * Holding the top-level event's child_mutex means that any
5263  * descendant process that has inherited this event will block
5264  * in perf_event_exit_event() if it goes to exit, thus satisfying the
5265  * task existence requirements of perf_event_enable/disable.
5266  */
5267 static void perf_event_for_each_child(struct perf_event *event,
5268                                         void (*func)(struct perf_event *))
5269 {
5270         struct perf_event *child;
5271
5272         WARN_ON_ONCE(event->ctx->parent_ctx);
5273
5274         mutex_lock(&event->child_mutex);
5275         func(event);
5276         list_for_each_entry(child, &event->child_list, child_list)
5277                 func(child);
5278         mutex_unlock(&event->child_mutex);
5279 }
5280
5281 static void perf_event_for_each(struct perf_event *event,
5282                                   void (*func)(struct perf_event *))
5283 {
5284         struct perf_event_context *ctx = event->ctx;
5285         struct perf_event *sibling;
5286
5287         lockdep_assert_held(&ctx->mutex);
5288
5289         event = event->group_leader;
5290
5291         perf_event_for_each_child(event, func);
5292         for_each_sibling_event(sibling, event)
5293                 perf_event_for_each_child(sibling, func);
5294 }
5295
5296 static void __perf_event_period(struct perf_event *event,
5297                                 struct perf_cpu_context *cpuctx,
5298                                 struct perf_event_context *ctx,
5299                                 void *info)
5300 {
5301         u64 value = *((u64 *)info);
5302         bool active;
5303
5304         if (event->attr.freq) {
5305                 event->attr.sample_freq = value;
5306         } else {
5307                 event->attr.sample_period = value;
5308                 event->hw.sample_period = value;
5309         }
5310
5311         active = (event->state == PERF_EVENT_STATE_ACTIVE);
5312         if (active) {
5313                 perf_pmu_disable(ctx->pmu);
5314                 /*
5315                  * We could be throttled; unthrottle now to avoid the tick
5316                  * trying to unthrottle while we already re-started the event.
5317                  */
5318                 if (event->hw.interrupts == MAX_INTERRUPTS) {
5319                         event->hw.interrupts = 0;
5320                         perf_log_throttle(event, 1);
5321                 }
5322                 event->pmu->stop(event, PERF_EF_UPDATE);
5323         }
5324
5325         local64_set(&event->hw.period_left, 0);
5326
5327         if (active) {
5328                 event->pmu->start(event, PERF_EF_RELOAD);
5329                 perf_pmu_enable(ctx->pmu);
5330         }
5331 }
5332
5333 static int perf_event_check_period(struct perf_event *event, u64 value)
5334 {
5335         return event->pmu->check_period(event, value);
5336 }
5337
5338 static int _perf_event_period(struct perf_event *event, u64 value)
5339 {
5340         if (!is_sampling_event(event))
5341                 return -EINVAL;
5342
5343         if (!value)
5344                 return -EINVAL;
5345
5346         if (event->attr.freq && value > sysctl_perf_event_sample_rate)
5347                 return -EINVAL;
5348
5349         if (perf_event_check_period(event, value))
5350                 return -EINVAL;
5351
5352         if (!event->attr.freq && (value & (1ULL << 63)))
5353                 return -EINVAL;
5354
5355         event_function_call(event, __perf_event_period, &value);
5356
5357         return 0;
5358 }
5359
5360 int perf_event_period(struct perf_event *event, u64 value)
5361 {
5362         struct perf_event_context *ctx;
5363         int ret;
5364
5365         ctx = perf_event_ctx_lock(event);
5366         ret = _perf_event_period(event, value);
5367         perf_event_ctx_unlock(event, ctx);
5368
5369         return ret;
5370 }
5371 EXPORT_SYMBOL_GPL(perf_event_period);
5372
5373 static const struct file_operations perf_fops;
5374
5375 static inline int perf_fget_light(int fd, struct fd *p)
5376 {
5377         struct fd f = fdget(fd);
5378         if (!f.file)
5379                 return -EBADF;
5380
5381         if (f.file->f_op != &perf_fops) {
5382                 fdput(f);
5383                 return -EBADF;
5384         }
5385         *p = f;
5386         return 0;
5387 }
5388
5389 static int perf_event_set_output(struct perf_event *event,
5390                                  struct perf_event *output_event);
5391 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
5392 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
5393 static int perf_copy_attr(struct perf_event_attr __user *uattr,
5394                           struct perf_event_attr *attr);
5395
5396 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
5397 {
5398         void (*func)(struct perf_event *);
5399         u32 flags = arg;
5400
5401         switch (cmd) {
5402         case PERF_EVENT_IOC_ENABLE:
5403                 func = _perf_event_enable;
5404                 break;
5405         case PERF_EVENT_IOC_DISABLE:
5406                 func = _perf_event_disable;
5407                 break;
5408         case PERF_EVENT_IOC_RESET:
5409                 func = _perf_event_reset;
5410                 break;
5411
5412         case PERF_EVENT_IOC_REFRESH:
5413                 return _perf_event_refresh(event, arg);
5414
5415         case PERF_EVENT_IOC_PERIOD:
5416         {
5417                 u64 value;
5418
5419                 if (copy_from_user(&value, (u64 __user *)arg, sizeof(value)))
5420                         return -EFAULT;
5421
5422                 return _perf_event_period(event, value);
5423         }
5424         case PERF_EVENT_IOC_ID:
5425         {
5426                 u64 id = primary_event_id(event);
5427
5428                 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
5429                         return -EFAULT;
5430                 return 0;
5431         }
5432
5433         case PERF_EVENT_IOC_SET_OUTPUT:
5434         {
5435                 int ret;
5436                 if (arg != -1) {
5437                         struct perf_event *output_event;
5438                         struct fd output;
5439                         ret = perf_fget_light(arg, &output);
5440                         if (ret)
5441                                 return ret;
5442                         output_event = output.file->private_data;
5443                         ret = perf_event_set_output(event, output_event);
5444                         fdput(output);
5445                 } else {
5446                         ret = perf_event_set_output(event, NULL);
5447                 }
5448                 return ret;
5449         }
5450
5451         case PERF_EVENT_IOC_SET_FILTER:
5452                 return perf_event_set_filter(event, (void __user *)arg);
5453
5454         case PERF_EVENT_IOC_SET_BPF:
5455                 return perf_event_set_bpf_prog(event, arg);
5456
5457         case PERF_EVENT_IOC_PAUSE_OUTPUT: {
5458                 struct perf_buffer *rb;
5459
5460                 rcu_read_lock();
5461                 rb = rcu_dereference(event->rb);
5462                 if (!rb || !rb->nr_pages) {
5463                         rcu_read_unlock();
5464                         return -EINVAL;
5465                 }
5466                 rb_toggle_paused(rb, !!arg);
5467                 rcu_read_unlock();
5468                 return 0;
5469         }
5470
5471         case PERF_EVENT_IOC_QUERY_BPF:
5472                 return perf_event_query_prog_array(event, (void __user *)arg);
5473
5474         case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
5475                 struct perf_event_attr new_attr;
5476                 int err = perf_copy_attr((struct perf_event_attr __user *)arg,
5477                                          &new_attr);
5478
5479                 if (err)
5480                         return err;
5481
5482                 return perf_event_modify_attr(event,  &new_attr);
5483         }
5484         default:
5485                 return -ENOTTY;
5486         }
5487
5488         if (flags & PERF_IOC_FLAG_GROUP)
5489                 perf_event_for_each(event, func);
5490         else
5491                 perf_event_for_each_child(event, func);
5492
5493         return 0;
5494 }
5495
5496 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
5497 {
5498         struct perf_event *event = file->private_data;
5499         struct perf_event_context *ctx;
5500         long ret;
5501
5502         /* Treat ioctl like writes as it is likely a mutating operation. */
5503         ret = security_perf_event_write(event);
5504         if (ret)
5505                 return ret;
5506
5507         ctx = perf_event_ctx_lock(event);
5508         ret = _perf_ioctl(event, cmd, arg);
5509         perf_event_ctx_unlock(event, ctx);
5510
5511         return ret;
5512 }
5513
5514 #ifdef CONFIG_COMPAT
5515 static long perf_compat_ioctl(struct file *file, unsigned int cmd,
5516                                 unsigned long arg)
5517 {
5518         switch (_IOC_NR(cmd)) {
5519         case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
5520         case _IOC_NR(PERF_EVENT_IOC_ID):
5521         case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
5522         case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
5523                 /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */
5524                 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
5525                         cmd &= ~IOCSIZE_MASK;
5526                         cmd |= sizeof(void *) << IOCSIZE_SHIFT;
5527                 }
5528                 break;
5529         }
5530         return perf_ioctl(file, cmd, arg);
5531 }
5532 #else
5533 # define perf_compat_ioctl NULL
5534 #endif
5535
5536 int perf_event_task_enable(void)
5537 {
5538         struct perf_event_context *ctx;
5539         struct perf_event *event;
5540
5541         mutex_lock(&current->perf_event_mutex);
5542         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5543                 ctx = perf_event_ctx_lock(event);
5544                 perf_event_for_each_child(event, _perf_event_enable);
5545                 perf_event_ctx_unlock(event, ctx);
5546         }
5547         mutex_unlock(&current->perf_event_mutex);
5548
5549         return 0;
5550 }
5551
5552 int perf_event_task_disable(void)
5553 {
5554         struct perf_event_context *ctx;
5555         struct perf_event *event;
5556
5557         mutex_lock(&current->perf_event_mutex);
5558         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5559                 ctx = perf_event_ctx_lock(event);
5560                 perf_event_for_each_child(event, _perf_event_disable);
5561                 perf_event_ctx_unlock(event, ctx);
5562         }
5563         mutex_unlock(&current->perf_event_mutex);
5564
5565         return 0;
5566 }
5567
5568 static int perf_event_index(struct perf_event *event)
5569 {
5570         if (event->hw.state & PERF_HES_STOPPED)
5571                 return 0;
5572
5573         if (event->state != PERF_EVENT_STATE_ACTIVE)
5574                 return 0;
5575
5576         return event->pmu->event_idx(event);
5577 }
5578
5579 static void calc_timer_values(struct perf_event *event,
5580                                 u64 *now,
5581                                 u64 *enabled,
5582                                 u64 *running)
5583 {
5584         u64 ctx_time;
5585
5586         *now = perf_clock();
5587         ctx_time = event->shadow_ctx_time + *now;
5588         __perf_update_times(event, ctx_time, enabled, running);
5589 }
5590
5591 static void perf_event_init_userpage(struct perf_event *event)
5592 {
5593         struct perf_event_mmap_page *userpg;
5594         struct perf_buffer *rb;
5595
5596         rcu_read_lock();
5597         rb = rcu_dereference(event->rb);
5598         if (!rb)
5599                 goto unlock;
5600
5601         userpg = rb->user_page;
5602
5603         /* Allow new userspace to detect that bit 0 is deprecated */
5604         userpg->cap_bit0_is_deprecated = 1;
5605         userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
5606         userpg->data_offset = PAGE_SIZE;
5607         userpg->data_size = perf_data_size(rb);
5608
5609 unlock:
5610         rcu_read_unlock();
5611 }
5612
5613 void __weak arch_perf_update_userpage(
5614         struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
5615 {
5616 }
5617
5618 /*
5619  * Callers need to ensure there can be no nesting of this function, otherwise
5620  * the seqlock logic goes bad. We can not serialize this because the arch
5621  * code calls this from NMI context.
5622  */
5623 void perf_event_update_userpage(struct perf_event *event)
5624 {
5625         struct perf_event_mmap_page *userpg;
5626         struct perf_buffer *rb;
5627         u64 enabled, running, now;
5628
5629         rcu_read_lock();
5630         rb = rcu_dereference(event->rb);
5631         if (!rb)
5632                 goto unlock;
5633
5634         /*
5635          * compute total_time_enabled, total_time_running
5636          * based on snapshot values taken when the event
5637          * was last scheduled in.
5638          *
5639          * we cannot simply called update_context_time()
5640          * because of locking issue as we can be called in
5641          * NMI context
5642          */
5643         calc_timer_values(event, &now, &enabled, &running);
5644
5645         userpg = rb->user_page;
5646         /*
5647          * Disable preemption to guarantee consistent time stamps are stored to
5648          * the user page.
5649          */
5650         preempt_disable();
5651         ++userpg->lock;
5652         barrier();
5653         userpg->index = perf_event_index(event);
5654         userpg->offset = perf_event_count(event);
5655         if (userpg->index)
5656                 userpg->offset -= local64_read(&event->hw.prev_count);
5657
5658         userpg->time_enabled = enabled +
5659                         atomic64_read(&event->child_total_time_enabled);
5660
5661         userpg->time_running = running +
5662                         atomic64_read(&event->child_total_time_running);
5663
5664         arch_perf_update_userpage(event, userpg, now);
5665
5666         barrier();
5667         ++userpg->lock;
5668         preempt_enable();
5669 unlock:
5670         rcu_read_unlock();
5671 }
5672 EXPORT_SYMBOL_GPL(perf_event_update_userpage);
5673
5674 static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
5675 {
5676         struct perf_event *event = vmf->vma->vm_file->private_data;
5677         struct perf_buffer *rb;
5678         vm_fault_t ret = VM_FAULT_SIGBUS;
5679
5680         if (vmf->flags & FAULT_FLAG_MKWRITE) {
5681                 if (vmf->pgoff == 0)
5682                         ret = 0;
5683                 return ret;
5684         }
5685
5686         rcu_read_lock();
5687         rb = rcu_dereference(event->rb);
5688         if (!rb)
5689                 goto unlock;
5690
5691         if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
5692                 goto unlock;
5693
5694         vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
5695         if (!vmf->page)
5696                 goto unlock;
5697
5698         get_page(vmf->page);
5699         vmf->page->mapping = vmf->vma->vm_file->f_mapping;
5700         vmf->page->index   = vmf->pgoff;
5701
5702         ret = 0;
5703 unlock:
5704         rcu_read_unlock();
5705
5706         return ret;
5707 }
5708
5709 static void ring_buffer_attach(struct perf_event *event,
5710                                struct perf_buffer *rb)
5711 {
5712         struct perf_buffer *old_rb = NULL;
5713         unsigned long flags;
5714
5715         if (event->rb) {
5716                 /*
5717                  * Should be impossible, we set this when removing
5718                  * event->rb_entry and wait/clear when adding event->rb_entry.
5719                  */
5720                 WARN_ON_ONCE(event->rcu_pending);
5721
5722                 old_rb = event->rb;
5723                 spin_lock_irqsave(&old_rb->event_lock, flags);
5724                 list_del_rcu(&event->rb_entry);
5725                 spin_unlock_irqrestore(&old_rb->event_lock, flags);
5726
5727                 event->rcu_batches = get_state_synchronize_rcu();
5728                 event->rcu_pending = 1;
5729         }
5730
5731         if (rb) {
5732                 if (event->rcu_pending) {
5733                         cond_synchronize_rcu(event->rcu_batches);
5734                         event->rcu_pending = 0;
5735                 }
5736
5737                 spin_lock_irqsave(&rb->event_lock, flags);
5738                 list_add_rcu(&event->rb_entry, &rb->event_list);
5739                 spin_unlock_irqrestore(&rb->event_lock, flags);
5740         }
5741
5742         /*
5743          * Avoid racing with perf_mmap_close(AUX): stop the event
5744          * before swizzling the event::rb pointer; if it's getting
5745          * unmapped, its aux_mmap_count will be 0 and it won't
5746          * restart. See the comment in __perf_pmu_output_stop().
5747          *
5748          * Data will inevitably be lost when set_output is done in
5749          * mid-air, but then again, whoever does it like this is
5750          * not in for the data anyway.
5751          */
5752         if (has_aux(event))
5753                 perf_event_stop(event, 0);
5754
5755         rcu_assign_pointer(event->rb, rb);
5756
5757         if (old_rb) {
5758                 ring_buffer_put(old_rb);
5759                 /*
5760                  * Since we detached before setting the new rb, so that we
5761                  * could attach the new rb, we could have missed a wakeup.
5762                  * Provide it now.
5763                  */
5764                 wake_up_all(&event->waitq);
5765         }
5766 }
5767
5768 static void ring_buffer_wakeup(struct perf_event *event)
5769 {
5770         struct perf_buffer *rb;
5771
5772         rcu_read_lock();
5773         rb = rcu_dereference(event->rb);
5774         if (rb) {
5775                 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
5776                         wake_up_all(&event->waitq);
5777         }
5778         rcu_read_unlock();
5779 }
5780
5781 struct perf_buffer *ring_buffer_get(struct perf_event *event)
5782 {
5783         struct perf_buffer *rb;
5784
5785         rcu_read_lock();
5786         rb = rcu_dereference(event->rb);
5787         if (rb) {
5788                 if (!refcount_inc_not_zero(&rb->refcount))
5789                         rb = NULL;
5790         }
5791         rcu_read_unlock();
5792
5793         return rb;
5794 }
5795
5796 void ring_buffer_put(struct perf_buffer *rb)
5797 {
5798         if (!refcount_dec_and_test(&rb->refcount))
5799                 return;
5800
5801         WARN_ON_ONCE(!list_empty(&rb->event_list));
5802
5803         call_rcu(&rb->rcu_head, rb_free_rcu);
5804 }
5805
5806 static void perf_mmap_open(struct vm_area_struct *vma)
5807 {
5808         struct perf_event *event = vma->vm_file->private_data;
5809
5810         atomic_inc(&event->mmap_count);
5811         atomic_inc(&event->rb->mmap_count);
5812
5813         if (vma->vm_pgoff)
5814                 atomic_inc(&event->rb->aux_mmap_count);
5815
5816         if (event->pmu->event_mapped)
5817                 event->pmu->event_mapped(event, vma->vm_mm);
5818 }
5819
5820 static void perf_pmu_output_stop(struct perf_event *event);
5821
5822 /*
5823  * A buffer can be mmap()ed multiple times; either directly through the same
5824  * event, or through other events by use of perf_event_set_output().
5825  *
5826  * In order to undo the VM accounting done by perf_mmap() we need to destroy
5827  * the buffer here, where we still have a VM context. This means we need
5828  * to detach all events redirecting to us.
5829  */
5830 static void perf_mmap_close(struct vm_area_struct *vma)
5831 {
5832         struct perf_event *event = vma->vm_file->private_data;
5833
5834         struct perf_buffer *rb = ring_buffer_get(event);
5835         struct user_struct *mmap_user = rb->mmap_user;
5836         int mmap_locked = rb->mmap_locked;
5837         unsigned long size = perf_data_size(rb);
5838
5839         if (event->pmu->event_unmapped)
5840                 event->pmu->event_unmapped(event, vma->vm_mm);
5841
5842         /*
5843          * rb->aux_mmap_count will always drop before rb->mmap_count and
5844          * event->mmap_count, so it is ok to use event->mmap_mutex to
5845          * serialize with perf_mmap here.
5846          */
5847         if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
5848             atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
5849                 /*
5850                  * Stop all AUX events that are writing to this buffer,
5851                  * so that we can free its AUX pages and corresponding PMU
5852                  * data. Note that after rb::aux_mmap_count dropped to zero,
5853                  * they won't start any more (see perf_aux_output_begin()).
5854                  */
5855                 perf_pmu_output_stop(event);
5856
5857                 /* now it's safe to free the pages */
5858                 atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
5859                 atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
5860
5861                 /* this has to be the last one */
5862                 rb_free_aux(rb);
5863                 WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
5864
5865                 mutex_unlock(&event->mmap_mutex);
5866         }
5867
5868         atomic_dec(&rb->mmap_count);
5869
5870         if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
5871                 goto out_put;
5872
5873         ring_buffer_attach(event, NULL);
5874         mutex_unlock(&event->mmap_mutex);
5875
5876         /* If there's still other mmap()s of this buffer, we're done. */
5877         if (atomic_read(&rb->mmap_count))
5878                 goto out_put;
5879
5880         /*
5881          * No other mmap()s, detach from all other events that might redirect
5882          * into the now unreachable buffer. Somewhat complicated by the
5883          * fact that rb::event_lock otherwise nests inside mmap_mutex.
5884          */
5885 again:
5886         rcu_read_lock();
5887         list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
5888                 if (!atomic_long_inc_not_zero(&event->refcount)) {
5889                         /*
5890                          * This event is en-route to free_event() which will
5891                          * detach it and remove it from the list.
5892                          */
5893                         continue;
5894                 }
5895                 rcu_read_unlock();
5896
5897                 mutex_lock(&event->mmap_mutex);
5898                 /*
5899                  * Check we didn't race with perf_event_set_output() which can
5900                  * swizzle the rb from under us while we were waiting to
5901                  * acquire mmap_mutex.
5902                  *
5903                  * If we find a different rb; ignore this event, a next
5904                  * iteration will no longer find it on the list. We have to
5905                  * still restart the iteration to make sure we're not now
5906                  * iterating the wrong list.
5907                  */
5908                 if (event->rb == rb)
5909                         ring_buffer_attach(event, NULL);
5910
5911                 mutex_unlock(&event->mmap_mutex);
5912                 put_event(event);
5913
5914                 /*
5915                  * Restart the iteration; either we're on the wrong list or
5916                  * destroyed its integrity by doing a deletion.
5917                  */
5918                 goto again;
5919         }
5920         rcu_read_unlock();
5921
5922         /*
5923          * It could be there's still a few 0-ref events on the list; they'll
5924          * get cleaned up by free_event() -- they'll also still have their
5925          * ref on the rb and will free it whenever they are done with it.
5926          *
5927          * Aside from that, this buffer is 'fully' detached and unmapped,
5928          * undo the VM accounting.
5929          */
5930
5931         atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
5932                         &mmap_user->locked_vm);
5933         atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
5934         free_uid(mmap_user);
5935
5936 out_put:
5937         ring_buffer_put(rb); /* could be last */
5938 }
5939
5940 static const struct vm_operations_struct perf_mmap_vmops = {
5941         .open           = perf_mmap_open,
5942         .close          = perf_mmap_close, /* non mergeable */
5943         .fault          = perf_mmap_fault,
5944         .page_mkwrite   = perf_mmap_fault,
5945 };
5946
5947 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
5948 {
5949         struct perf_event *event = file->private_data;
5950         unsigned long user_locked, user_lock_limit;
5951         struct user_struct *user = current_user();
5952         struct perf_buffer *rb = NULL;
5953         unsigned long locked, lock_limit;
5954         unsigned long vma_size;
5955         unsigned long nr_pages;
5956         long user_extra = 0, extra = 0;
5957         int ret = 0, flags = 0;
5958
5959         /*
5960          * Don't allow mmap() of inherited per-task counters. This would
5961          * create a performance issue due to all children writing to the
5962          * same rb.
5963          */
5964         if (event->cpu == -1 && event->attr.inherit)
5965                 return -EINVAL;
5966
5967         if (!(vma->vm_flags & VM_SHARED))
5968                 return -EINVAL;
5969
5970         ret = security_perf_event_read(event);
5971         if (ret)
5972                 return ret;
5973
5974         vma_size = vma->vm_end - vma->vm_start;
5975
5976         if (vma->vm_pgoff == 0) {
5977                 nr_pages = (vma_size / PAGE_SIZE) - 1;
5978         } else {
5979                 /*
5980                  * AUX area mapping: if rb->aux_nr_pages != 0, it's already
5981                  * mapped, all subsequent mappings should have the same size
5982                  * and offset. Must be above the normal perf buffer.
5983                  */
5984                 u64 aux_offset, aux_size;
5985
5986                 if (!event->rb)
5987                         return -EINVAL;
5988
5989                 nr_pages = vma_size / PAGE_SIZE;
5990
5991                 mutex_lock(&event->mmap_mutex);
5992                 ret = -EINVAL;
5993
5994                 rb = event->rb;
5995                 if (!rb)
5996                         goto aux_unlock;
5997
5998                 aux_offset = READ_ONCE(rb->user_page->aux_offset);
5999                 aux_size = READ_ONCE(rb->user_page->aux_size);
6000
6001                 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
6002                         goto aux_unlock;
6003
6004                 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
6005                         goto aux_unlock;
6006
6007                 /* already mapped with a different offset */
6008                 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
6009                         goto aux_unlock;
6010
6011                 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
6012                         goto aux_unlock;
6013
6014                 /* already mapped with a different size */
6015                 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
6016                         goto aux_unlock;
6017
6018                 if (!is_power_of_2(nr_pages))
6019                         goto aux_unlock;
6020
6021                 if (!atomic_inc_not_zero(&rb->mmap_count))
6022                         goto aux_unlock;
6023
6024                 if (rb_has_aux(rb)) {
6025                         atomic_inc(&rb->aux_mmap_count);
6026                         ret = 0;
6027                         goto unlock;
6028                 }
6029
6030                 atomic_set(&rb->aux_mmap_count, 1);
6031                 user_extra = nr_pages;
6032
6033                 goto accounting;
6034         }
6035
6036         /*
6037          * If we have rb pages ensure they're a power-of-two number, so we
6038          * can do bitmasks instead of modulo.
6039          */
6040         if (nr_pages != 0 && !is_power_of_2(nr_pages))
6041                 return -EINVAL;
6042
6043         if (vma_size != PAGE_SIZE * (1 + nr_pages))
6044                 return -EINVAL;
6045
6046         WARN_ON_ONCE(event->ctx->parent_ctx);
6047 again:
6048         mutex_lock(&event->mmap_mutex);
6049         if (event->rb) {
6050                 if (event->rb->nr_pages != nr_pages) {
6051                         ret = -EINVAL;
6052                         goto unlock;
6053                 }
6054
6055                 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
6056                         /*
6057                          * Raced against perf_mmap_close() through
6058                          * perf_event_set_output(). Try again, hope for better
6059                          * luck.
6060                          */
6061                         mutex_unlock(&event->mmap_mutex);
6062                         goto again;
6063                 }
6064
6065                 goto unlock;
6066         }
6067
6068         user_extra = nr_pages + 1;
6069
6070 accounting:
6071         user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
6072
6073         /*
6074          * Increase the limit linearly with more CPUs:
6075          */
6076         user_lock_limit *= num_online_cpus();
6077
6078         user_locked = atomic_long_read(&user->locked_vm);
6079
6080         /*
6081          * sysctl_perf_event_mlock may have changed, so that
6082          *     user->locked_vm > user_lock_limit
6083          */
6084         if (user_locked > user_lock_limit)
6085                 user_locked = user_lock_limit;
6086         user_locked += user_extra;
6087
6088         if (user_locked > user_lock_limit) {
6089                 /*
6090                  * charge locked_vm until it hits user_lock_limit;
6091                  * charge the rest from pinned_vm
6092                  */
6093                 extra = user_locked - user_lock_limit;
6094                 user_extra -= extra;
6095         }
6096
6097         lock_limit = rlimit(RLIMIT_MEMLOCK);
6098         lock_limit >>= PAGE_SHIFT;
6099         locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
6100
6101         if ((locked > lock_limit) && perf_is_paranoid() &&
6102                 !capable(CAP_IPC_LOCK)) {
6103                 ret = -EPERM;
6104                 goto unlock;
6105         }
6106
6107         WARN_ON(!rb && event->rb);
6108
6109         if (vma->vm_flags & VM_WRITE)
6110                 flags |= RING_BUFFER_WRITABLE;
6111
6112         if (!rb) {
6113                 rb = rb_alloc(nr_pages,
6114                               event->attr.watermark ? event->attr.wakeup_watermark : 0,
6115                               event->cpu, flags);
6116
6117                 if (!rb) {
6118                         ret = -ENOMEM;
6119                         goto unlock;
6120                 }
6121
6122                 atomic_set(&rb->mmap_count, 1);
6123                 rb->mmap_user = get_current_user();
6124                 rb->mmap_locked = extra;
6125
6126                 ring_buffer_attach(event, rb);
6127
6128                 perf_event_init_userpage(event);
6129                 perf_event_update_userpage(event);
6130         } else {
6131                 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
6132                                    event->attr.aux_watermark, flags);
6133                 if (!ret)
6134                         rb->aux_mmap_locked = extra;
6135         }
6136
6137 unlock:
6138         if (!ret) {
6139                 atomic_long_add(user_extra, &user->locked_vm);
6140                 atomic64_add(extra, &vma->vm_mm->pinned_vm);
6141
6142                 atomic_inc(&event->mmap_count);
6143         } else if (rb) {
6144                 atomic_dec(&rb->mmap_count);
6145         }
6146 aux_unlock:
6147         mutex_unlock(&event->mmap_mutex);
6148
6149         /*
6150          * Since pinned accounting is per vm we cannot allow fork() to copy our
6151          * vma.
6152          */
6153         vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
6154         vma->vm_ops = &perf_mmap_vmops;
6155
6156         if (event->pmu->event_mapped)
6157                 event->pmu->event_mapped(event, vma->vm_mm);
6158
6159         return ret;
6160 }
6161
6162 static int perf_fasync(int fd, struct file *filp, int on)
6163 {
6164         struct inode *inode = file_inode(filp);
6165         struct perf_event *event = filp->private_data;
6166         int retval;
6167
6168         inode_lock(inode);
6169         retval = fasync_helper(fd, filp, on, &event->fasync);
6170         inode_unlock(inode);
6171
6172         if (retval < 0)
6173                 return retval;
6174
6175         return 0;
6176 }
6177
6178 static const struct file_operations perf_fops = {
6179         .llseek                 = no_llseek,
6180         .release                = perf_release,
6181         .read                   = perf_read,
6182         .poll                   = perf_poll,
6183         .unlocked_ioctl         = perf_ioctl,
6184         .compat_ioctl           = perf_compat_ioctl,
6185         .mmap                   = perf_mmap,
6186         .fasync                 = perf_fasync,
6187 };
6188
6189 /*
6190  * Perf event wakeup
6191  *
6192  * If there's data, ensure we set the poll() state and publish everything
6193  * to user-space before waking everybody up.
6194  */
6195
6196 static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
6197 {
6198         /* only the parent has fasync state */
6199         if (event->parent)
6200                 event = event->parent;
6201         return &event->fasync;
6202 }
6203
6204 void perf_event_wakeup(struct perf_event *event)
6205 {
6206         ring_buffer_wakeup(event);
6207
6208         if (event->pending_kill) {
6209                 kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
6210                 event->pending_kill = 0;
6211         }
6212 }
6213
6214 static void perf_pending_event_disable(struct perf_event *event)
6215 {
6216         int cpu = READ_ONCE(event->pending_disable);
6217
6218         if (cpu < 0)
6219                 return;
6220
6221         if (cpu == smp_processor_id()) {
6222                 WRITE_ONCE(event->pending_disable, -1);
6223                 perf_event_disable_local(event);
6224                 return;
6225         }
6226
6227         /*
6228          *  CPU-A                       CPU-B
6229          *
6230          *  perf_event_disable_inatomic()
6231          *    @pending_disable = CPU-A;
6232          *    irq_work_queue();
6233          *
6234          *  sched-out
6235          *    @pending_disable = -1;
6236          *
6237          *                              sched-in
6238          *                              perf_event_disable_inatomic()
6239          *                                @pending_disable = CPU-B;
6240          *                                irq_work_queue(); // FAILS
6241          *
6242          *  irq_work_run()
6243          *    perf_pending_event()
6244          *
6245          * But the event runs on CPU-B and wants disabling there.
6246          */
6247         irq_work_queue_on(&event->pending, cpu);
6248 }
6249
6250 static void perf_pending_event(struct irq_work *entry)
6251 {
6252         struct perf_event *event = container_of(entry, struct perf_event, pending);
6253         int rctx;
6254
6255         rctx = perf_swevent_get_recursion_context();
6256         /*
6257          * If we 'fail' here, that's OK, it means recursion is already disabled
6258          * and we won't recurse 'further'.
6259          */
6260
6261         perf_pending_event_disable(event);
6262
6263         if (event->pending_wakeup) {
6264                 event->pending_wakeup = 0;
6265                 perf_event_wakeup(event);
6266         }
6267
6268         if (rctx >= 0)
6269                 perf_swevent_put_recursion_context(rctx);
6270 }
6271
6272 /*
6273  * We assume there is only KVM supporting the callbacks.
6274  * Later on, we might change it to a list if there is
6275  * another virtualization implementation supporting the callbacks.
6276  */
6277 struct perf_guest_info_callbacks *perf_guest_cbs;
6278
6279 int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
6280 {
6281         perf_guest_cbs = cbs;
6282         return 0;
6283 }
6284 EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
6285
6286 int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
6287 {
6288         perf_guest_cbs = NULL;
6289         return 0;
6290 }
6291 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
6292
6293 static void
6294 perf_output_sample_regs(struct perf_output_handle *handle,
6295                         struct pt_regs *regs, u64 mask)
6296 {
6297         int bit;
6298         DECLARE_BITMAP(_mask, 64);
6299
6300         bitmap_from_u64(_mask, mask);
6301         for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
6302                 u64 val;
6303
6304                 val = perf_reg_value(regs, bit);
6305                 perf_output_put(handle, val);
6306         }
6307 }
6308
6309 static void perf_sample_regs_user(struct perf_regs *regs_user,
6310                                   struct pt_regs *regs,
6311                                   struct pt_regs *regs_user_copy)
6312 {
6313         if (user_mode(regs)) {
6314                 regs_user->abi = perf_reg_abi(current);
6315                 regs_user->regs = regs;
6316         } else if (!(current->flags & PF_KTHREAD)) {
6317                 perf_get_regs_user(regs_user, regs, regs_user_copy);
6318         } else {
6319                 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
6320                 regs_user->regs = NULL;
6321         }
6322 }
6323
6324 static void perf_sample_regs_intr(struct perf_regs *regs_intr,
6325                                   struct pt_regs *regs)
6326 {
6327         regs_intr->regs = regs;
6328         regs_intr->abi  = perf_reg_abi(current);
6329 }
6330
6331
6332 /*
6333  * Get remaining task size from user stack pointer.
6334  *
6335  * It'd be better to take stack vma map and limit this more
6336  * precisely, but there's no way to get it safely under interrupt,
6337  * so using TASK_SIZE as limit.
6338  */
6339 static u64 perf_ustack_task_size(struct pt_regs *regs)
6340 {
6341         unsigned long addr = perf_user_stack_pointer(regs);
6342
6343         if (!addr || addr >= TASK_SIZE)
6344                 return 0;
6345
6346         return TASK_SIZE - addr;
6347 }
6348
6349 static u16
6350 perf_sample_ustack_size(u16 stack_size, u16 header_size,
6351                         struct pt_regs *regs)
6352 {
6353         u64 task_size;
6354
6355         /* No regs, no stack pointer, no dump. */
6356         if (!regs)
6357                 return 0;
6358
6359         /*
6360          * Check if we fit in with the requested stack size into the:
6361          * - TASK_SIZE
6362          *   If we don't, we limit the size to the TASK_SIZE.
6363          *
6364          * - remaining sample size
6365          *   If we don't, we customize the stack size to
6366          *   fit in to the remaining sample size.
6367          */
6368
6369         task_size  = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
6370         stack_size = min(stack_size, (u16) task_size);
6371
6372         /* Current header size plus static size and dynamic size. */
6373         header_size += 2 * sizeof(u64);
6374
6375         /* Do we fit in with the current stack dump size? */
6376         if ((u16) (header_size + stack_size) < header_size) {
6377                 /*
6378                  * If we overflow the maximum size for the sample,
6379                  * we customize the stack dump size to fit in.
6380                  */
6381                 stack_size = USHRT_MAX - header_size - sizeof(u64);
6382                 stack_size = round_up(stack_size, sizeof(u64));
6383         }
6384
6385         return stack_size;
6386 }
6387
6388 static void
6389 perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
6390                           struct pt_regs *regs)
6391 {
6392         /* Case of a kernel thread, nothing to dump */
6393         if (!regs) {
6394                 u64 size = 0;
6395                 perf_output_put(handle, size);
6396         } else {
6397                 unsigned long sp;
6398                 unsigned int rem;
6399                 u64 dyn_size;
6400                 mm_segment_t fs;
6401
6402                 /*
6403                  * We dump:
6404                  * static size
6405                  *   - the size requested by user or the best one we can fit
6406                  *     in to the sample max size
6407                  * data
6408                  *   - user stack dump data
6409                  * dynamic size
6410                  *   - the actual dumped size
6411                  */
6412
6413                 /* Static size. */
6414                 perf_output_put(handle, dump_size);
6415
6416                 /* Data. */
6417                 sp = perf_user_stack_pointer(regs);
6418                 fs = get_fs();
6419                 set_fs(USER_DS);
6420                 rem = __output_copy_user(handle, (void *) sp, dump_size);
6421                 set_fs(fs);
6422                 dyn_size = dump_size - rem;
6423
6424                 perf_output_skip(handle, rem);
6425
6426                 /* Dynamic size. */
6427                 perf_output_put(handle, dyn_size);
6428         }
6429 }
6430
6431 static unsigned long perf_prepare_sample_aux(struct perf_event *event,
6432                                           struct perf_sample_data *data,
6433                                           size_t size)
6434 {
6435         struct perf_event *sampler = event->aux_event;
6436         struct perf_buffer *rb;
6437
6438         data->aux_size = 0;
6439
6440         if (!sampler)
6441                 goto out;
6442
6443         if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
6444                 goto out;
6445
6446         if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
6447                 goto out;
6448
6449         rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
6450         if (!rb)
6451                 goto out;
6452
6453         /*
6454          * If this is an NMI hit inside sampling code, don't take
6455          * the sample. See also perf_aux_sample_output().
6456          */
6457         if (READ_ONCE(rb->aux_in_sampling)) {
6458                 data->aux_size = 0;
6459         } else {
6460                 size = min_t(size_t, size, perf_aux_size(rb));
6461                 data->aux_size = ALIGN(size, sizeof(u64));
6462         }
6463         ring_buffer_put(rb);
6464
6465 out:
6466         return data->aux_size;
6467 }
6468
6469 long perf_pmu_snapshot_aux(struct perf_buffer *rb,
6470                            struct perf_event *event,
6471                            struct perf_output_handle *handle,
6472                            unsigned long size)
6473 {
6474         unsigned long flags;
6475         long ret;
6476
6477         /*
6478          * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler
6479          * paths. If we start calling them in NMI context, they may race with
6480          * the IRQ ones, that is, for example, re-starting an event that's just
6481          * been stopped, which is why we're using a separate callback that
6482          * doesn't change the event state.
6483          *
6484          * IRQs need to be disabled to prevent IPIs from racing with us.
6485          */
6486         local_irq_save(flags);
6487         /*
6488          * Guard against NMI hits inside the critical section;
6489          * see also perf_prepare_sample_aux().
6490          */
6491         WRITE_ONCE(rb->aux_in_sampling, 1);
6492         barrier();
6493
6494         ret = event->pmu->snapshot_aux(event, handle, size);
6495
6496         barrier();
6497         WRITE_ONCE(rb->aux_in_sampling, 0);
6498         local_irq_restore(flags);
6499
6500         return ret;
6501 }
6502
6503 static void perf_aux_sample_output(struct perf_event *event,
6504                                    struct perf_output_handle *handle,
6505                                    struct perf_sample_data *data)
6506 {
6507         struct perf_event *sampler = event->aux_event;
6508         struct perf_buffer *rb;
6509         unsigned long pad;
6510         long size;
6511
6512         if (WARN_ON_ONCE(!sampler || !data->aux_size))
6513                 return;
6514
6515         rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
6516         if (!rb)
6517                 return;
6518
6519         size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size);
6520
6521         /*
6522          * An error here means that perf_output_copy() failed (returned a
6523          * non-zero surplus that it didn't copy), which in its current
6524          * enlightened implementation is not possible. If that changes, we'd
6525          * like to know.
6526          */
6527         if (WARN_ON_ONCE(size < 0))
6528                 goto out_put;
6529
6530         /*
6531          * The pad comes from ALIGN()ing data->aux_size up to u64 in
6532          * perf_prepare_sample_aux(), so should not be more than that.
6533          */
6534         pad = data->aux_size - size;
6535         if (WARN_ON_ONCE(pad >= sizeof(u64)))
6536                 pad = 8;
6537
6538         if (pad) {
6539                 u64 zero = 0;
6540                 perf_output_copy(handle, &zero, pad);
6541         }
6542
6543 out_put:
6544         ring_buffer_put(rb);
6545 }
6546
6547 static void __perf_event_header__init_id(struct perf_event_header *header,
6548                                          struct perf_sample_data *data,
6549                                          struct perf_event *event)
6550 {
6551         u64 sample_type = event->attr.sample_type;
6552
6553         data->type = sample_type;
6554         header->size += event->id_header_size;
6555
6556         if (sample_type & PERF_SAMPLE_TID) {
6557                 /* namespace issues */
6558                 data->tid_entry.pid = perf_event_pid(event, current);
6559                 data->tid_entry.tid = perf_event_tid(event, current);
6560         }
6561
6562         if (sample_type & PERF_SAMPLE_TIME)
6563                 data->time = perf_event_clock(event);
6564
6565         if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
6566                 data->id = primary_event_id(event);
6567
6568         if (sample_type & PERF_SAMPLE_STREAM_ID)
6569                 data->stream_id = event->id;
6570
6571         if (sample_type & PERF_SAMPLE_CPU) {
6572                 data->cpu_entry.cpu      = raw_smp_processor_id();
6573                 data->cpu_entry.reserved = 0;
6574         }
6575 }
6576
6577 void perf_event_header__init_id(struct perf_event_header *header,
6578                                 struct perf_sample_data *data,
6579                                 struct perf_event *event)
6580 {
6581         if (event->attr.sample_id_all)
6582                 __perf_event_header__init_id(header, data, event);
6583 }
6584
6585 static void __perf_event__output_id_sample(struct perf_output_handle *handle,
6586                                            struct perf_sample_data *data)
6587 {
6588         u64 sample_type = data->type;
6589
6590         if (sample_type & PERF_SAMPLE_TID)
6591                 perf_output_put(handle, data->tid_entry);
6592
6593         if (sample_type & PERF_SAMPLE_TIME)
6594                 perf_output_put(handle, data->time);
6595
6596         if (sample_type & PERF_SAMPLE_ID)
6597                 perf_output_put(handle, data->id);
6598
6599         if (sample_type & PERF_SAMPLE_STREAM_ID)
6600                 perf_output_put(handle, data->stream_id);
6601
6602         if (sample_type & PERF_SAMPLE_CPU)
6603                 perf_output_put(handle, data->cpu_entry);
6604
6605         if (sample_type & PERF_SAMPLE_IDENTIFIER)
6606                 perf_output_put(handle, data->id);
6607 }
6608
6609 void perf_event__output_id_sample(struct perf_event *event,
6610                                   struct perf_output_handle *handle,
6611                                   struct perf_sample_data *sample)
6612 {
6613         if (event->attr.sample_id_all)
6614                 __perf_event__output_id_sample(handle, sample);
6615 }
6616
6617 static void perf_output_read_one(struct perf_output_handle *handle,
6618                                  struct perf_event *event,
6619                                  u64 enabled, u64 running)
6620 {
6621         u64 read_format = event->attr.read_format;
6622         u64 values[4];
6623         int n = 0;
6624
6625         values[n++] = perf_event_count(event);
6626         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
6627                 values[n++] = enabled +
6628                         atomic64_read(&event->child_total_time_enabled);
6629         }
6630         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
6631                 values[n++] = running +
6632                         atomic64_read(&event->child_total_time_running);
6633         }
6634         if (read_format & PERF_FORMAT_ID)
6635                 values[n++] = primary_event_id(event);
6636
6637         __output_copy(handle, values, n * sizeof(u64));
6638 }
6639
6640 static void perf_output_read_group(struct perf_output_handle *handle,
6641                             struct perf_event *event,
6642                             u64 enabled, u64 running)
6643 {
6644         struct perf_event *leader = event->group_leader, *sub;
6645         u64 read_format = event->attr.read_format;
6646         u64 values[5];
6647         int n = 0;
6648
6649         values[n++] = 1 + leader->nr_siblings;
6650
6651         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
6652                 values[n++] = enabled;
6653
6654         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
6655                 values[n++] = running;
6656
6657         if ((leader != event) &&
6658             (leader->state == PERF_EVENT_STATE_ACTIVE))
6659                 leader->pmu->read(leader);
6660
6661         values[n++] = perf_event_count(leader);
6662         if (read_format & PERF_FORMAT_ID)
6663                 values[n++] = primary_event_id(leader);
6664
6665         __output_copy(handle, values, n * sizeof(u64));
6666
6667         for_each_sibling_event(sub, leader) {
6668                 n = 0;
6669
6670                 if ((sub != event) &&
6671                     (sub->state == PERF_EVENT_STATE_ACTIVE))
6672                         sub->pmu->read(sub);
6673
6674                 values[n++] = perf_event_count(sub);
6675                 if (read_format & PERF_FORMAT_ID)
6676                         values[n++] = primary_event_id(sub);
6677
6678                 __output_copy(handle, values, n * sizeof(u64));
6679         }
6680 }
6681
6682 #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
6683                                  PERF_FORMAT_TOTAL_TIME_RUNNING)
6684
6685 /*
6686  * XXX PERF_SAMPLE_READ vs inherited events seems difficult.
6687  *
6688  * The problem is that its both hard and excessively expensive to iterate the
6689  * child list, not to mention that its impossible to IPI the children running
6690  * on another CPU, from interrupt/NMI context.
6691  */
6692 static void perf_output_read(struct perf_output_handle *handle,
6693                              struct perf_event *event)
6694 {
6695         u64 enabled = 0, running = 0, now;
6696         u64 read_format = event->attr.read_format;
6697
6698         /*
6699          * compute total_time_enabled, total_time_running
6700          * based on snapshot values taken when the event
6701          * was last scheduled in.
6702          *
6703          * we cannot simply called update_context_time()
6704          * because of locking issue as we are called in
6705          * NMI context
6706          */
6707         if (read_format & PERF_FORMAT_TOTAL_TIMES)
6708                 calc_timer_values(event, &now, &enabled, &running);
6709
6710         if (event->attr.read_format & PERF_FORMAT_GROUP)
6711                 perf_output_read_group(handle, event, enabled, running);
6712         else
6713                 perf_output_read_one(handle, event, enabled, running);
6714 }
6715
6716 static inline bool perf_sample_save_hw_index(struct perf_event *event)
6717 {
6718         return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX;
6719 }
6720
6721 void perf_output_sample(struct perf_output_handle *handle,
6722                         struct perf_event_header *header,
6723                         struct perf_sample_data *data,
6724                         struct perf_event *event)
6725 {
6726         u64 sample_type = data->type;
6727
6728         perf_output_put(handle, *header);
6729
6730         if (sample_type & PERF_SAMPLE_IDENTIFIER)
6731                 perf_output_put(handle, data->id);
6732
6733         if (sample_type & PERF_SAMPLE_IP)
6734                 perf_output_put(handle, data->ip);
6735
6736         if (sample_type & PERF_SAMPLE_TID)
6737                 perf_output_put(handle, data->tid_entry);
6738
6739         if (sample_type & PERF_SAMPLE_TIME)
6740                 perf_output_put(handle, data->time);
6741
6742         if (sample_type & PERF_SAMPLE_ADDR)
6743                 perf_output_put(handle, data->addr);
6744
6745         if (sample_type & PERF_SAMPLE_ID)
6746                 perf_output_put(handle, data->id);
6747
6748         if (sample_type & PERF_SAMPLE_STREAM_ID)
6749                 perf_output_put(handle, data->stream_id);
6750
6751         if (sample_type & PERF_SAMPLE_CPU)
6752                 perf_output_put(handle, data->cpu_entry);
6753
6754         if (sample_type & PERF_SAMPLE_PERIOD)
6755                 perf_output_put(handle, data->period);
6756
6757         if (sample_type & PERF_SAMPLE_READ)
6758                 perf_output_read(handle, event);
6759
6760         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
6761                 int size = 1;
6762
6763                 size += data->callchain->nr;
6764                 size *= sizeof(u64);
6765                 __output_copy(handle, data->callchain, size);
6766         }
6767
6768         if (sample_type & PERF_SAMPLE_RAW) {
6769                 struct perf_raw_record *raw = data->raw;
6770
6771                 if (raw) {
6772                         struct perf_raw_frag *frag = &raw->frag;
6773
6774                         perf_output_put(handle, raw->size);
6775                         do {
6776                                 if (frag->copy) {
6777                                         __output_custom(handle, frag->copy,
6778                                                         frag->data, frag->size);
6779                                 } else {
6780                                         __output_copy(handle, frag->data,
6781                                                       frag->size);
6782                                 }
6783                                 if (perf_raw_frag_last(frag))
6784                                         break;
6785                                 frag = frag->next;
6786                         } while (1);
6787                         if (frag->pad)
6788                                 __output_skip(handle, NULL, frag->pad);
6789                 } else {
6790                         struct {
6791                                 u32     size;
6792                                 u32     data;
6793                         } raw = {
6794                                 .size = sizeof(u32),
6795                                 .data = 0,
6796                         };
6797                         perf_output_put(handle, raw);
6798                 }
6799         }
6800
6801         if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
6802                 if (data->br_stack) {
6803                         size_t size;
6804
6805                         size = data->br_stack->nr
6806                              * sizeof(struct perf_branch_entry);
6807
6808                         perf_output_put(handle, data->br_stack->nr);
6809                         if (perf_sample_save_hw_index(event))
6810                                 perf_output_put(handle, data->br_stack->hw_idx);
6811                         perf_output_copy(handle, data->br_stack->entries, size);
6812                 } else {
6813                         /*
6814                          * we always store at least the value of nr
6815                          */
6816                         u64 nr = 0;
6817                         perf_output_put(handle, nr);
6818                 }
6819         }
6820
6821         if (sample_type & PERF_SAMPLE_REGS_USER) {
6822                 u64 abi = data->regs_user.abi;
6823
6824                 /*
6825                  * If there are no regs to dump, notice it through
6826                  * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
6827                  */
6828                 perf_output_put(handle, abi);
6829
6830                 if (abi) {
6831                         u64 mask = event->attr.sample_regs_user;
6832                         perf_output_sample_regs(handle,
6833                                                 data->regs_user.regs,
6834                                                 mask);
6835                 }
6836         }
6837
6838         if (sample_type & PERF_SAMPLE_STACK_USER) {
6839                 perf_output_sample_ustack(handle,
6840                                           data->stack_user_size,
6841                                           data->regs_user.regs);
6842         }
6843
6844         if (sample_type & PERF_SAMPLE_WEIGHT)
6845                 perf_output_put(handle, data->weight);
6846
6847         if (sample_type & PERF_SAMPLE_DATA_SRC)
6848                 perf_output_put(handle, data->data_src.val);
6849
6850         if (sample_type & PERF_SAMPLE_TRANSACTION)
6851                 perf_output_put(handle, data->txn);
6852
6853         if (sample_type & PERF_SAMPLE_REGS_INTR) {
6854                 u64 abi = data->regs_intr.abi;
6855                 /*
6856                  * If there are no regs to dump, notice it through
6857                  * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
6858                  */
6859                 perf_output_put(handle, abi);
6860
6861                 if (abi) {
6862                         u64 mask = event->attr.sample_regs_intr;
6863
6864                         perf_output_sample_regs(handle,
6865                                                 data->regs_intr.regs,
6866                                                 mask);
6867                 }
6868         }
6869
6870         if (sample_type & PERF_SAMPLE_PHYS_ADDR)
6871                 perf_output_put(handle, data->phys_addr);
6872
6873         if (sample_type & PERF_SAMPLE_CGROUP)
6874                 perf_output_put(handle, data->cgroup);
6875
6876         if (sample_type & PERF_SAMPLE_AUX) {
6877                 perf_output_put(handle, data->aux_size);
6878
6879                 if (data->aux_size)
6880                         perf_aux_sample_output(event, handle, data);
6881         }
6882
6883         if (!event->attr.watermark) {
6884                 int wakeup_events = event->attr.wakeup_events;
6885
6886                 if (wakeup_events) {
6887                         struct perf_buffer *rb = handle->rb;
6888                         int events = local_inc_return(&rb->events);
6889
6890                         if (events >= wakeup_events) {
6891                                 local_sub(wakeup_events, &rb->events);
6892                                 local_inc(&rb->wakeup);
6893                         }
6894                 }
6895         }
6896 }
6897
6898 static u64 perf_virt_to_phys(u64 virt)
6899 {
6900         u64 phys_addr = 0;
6901         struct page *p = NULL;
6902
6903         if (!virt)
6904                 return 0;
6905
6906         if (virt >= TASK_SIZE) {
6907                 /* If it's vmalloc()d memory, leave phys_addr as 0 */
6908                 if (virt_addr_valid((void *)(uintptr_t)virt) &&
6909                     !(virt >= VMALLOC_START && virt < VMALLOC_END))
6910                         phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
6911         } else {
6912                 /*
6913                  * Walking the pages tables for user address.
6914                  * Interrupts are disabled, so it prevents any tear down
6915                  * of the page tables.
6916                  * Try IRQ-safe __get_user_pages_fast first.
6917                  * If failed, leave phys_addr as 0.
6918                  */
6919                 if ((current->mm != NULL) &&
6920                     (__get_user_pages_fast(virt, 1, 0, &p) == 1))
6921                         phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
6922
6923                 if (p)
6924                         put_page(p);
6925         }
6926
6927         return phys_addr;
6928 }
6929
6930 static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
6931
6932 struct perf_callchain_entry *
6933 perf_callchain(struct perf_event *event, struct pt_regs *regs)
6934 {
6935         bool kernel = !event->attr.exclude_callchain_kernel;
6936         bool user   = !event->attr.exclude_callchain_user;
6937         /* Disallow cross-task user callchains. */
6938         bool crosstask = event->ctx->task && event->ctx->task != current;
6939         const u32 max_stack = event->attr.sample_max_stack;
6940         struct perf_callchain_entry *callchain;
6941
6942         if (!kernel && !user)
6943                 return &__empty_callchain;
6944
6945         callchain = get_perf_callchain(regs, 0, kernel, user,
6946                                        max_stack, crosstask, true);
6947         return callchain ?: &__empty_callchain;
6948 }
6949
6950 void perf_prepare_sample(struct perf_event_header *header,
6951                          struct perf_sample_data *data,
6952                          struct perf_event *event,
6953                          struct pt_regs *regs)
6954 {
6955         u64 sample_type = event->attr.sample_type;
6956
6957         header->type = PERF_RECORD_SAMPLE;
6958         header->size = sizeof(*header) + event->header_size;
6959
6960         header->misc = 0;
6961         header->misc |= perf_misc_flags(regs);
6962
6963         __perf_event_header__init_id(header, data, event);
6964
6965         if (sample_type & PERF_SAMPLE_IP)
6966                 data->ip = perf_instruction_pointer(regs);
6967
6968         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
6969                 int size = 1;
6970
6971                 if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
6972                         data->callchain = perf_callchain(event, regs);
6973
6974                 size += data->callchain->nr;
6975
6976                 header->size += size * sizeof(u64);
6977         }
6978
6979         if (sample_type & PERF_SAMPLE_RAW) {
6980                 struct perf_raw_record *raw = data->raw;
6981                 int size;
6982
6983                 if (raw) {
6984                         struct perf_raw_frag *frag = &raw->frag;
6985                         u32 sum = 0;
6986
6987                         do {
6988                                 sum += frag->size;
6989                                 if (perf_raw_frag_last(frag))
6990                                         break;
6991                                 frag = frag->next;
6992                         } while (1);
6993
6994                         size = round_up(sum + sizeof(u32), sizeof(u64));
6995                         raw->size = size - sizeof(u32);
6996                         frag->pad = raw->size - sum;
6997                 } else {
6998                         size = sizeof(u64);
6999                 }
7000
7001                 header->size += size;
7002         }
7003
7004         if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
7005                 int size = sizeof(u64); /* nr */
7006                 if (data->br_stack) {
7007                         if (perf_sample_save_hw_index(event))
7008                                 size += sizeof(u64);
7009
7010                         size += data->br_stack->nr
7011                               * sizeof(struct perf_branch_entry);
7012                 }
7013                 header->size += size;
7014         }
7015
7016         if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
7017                 perf_sample_regs_user(&data->regs_user, regs,
7018                                       &data->regs_user_copy);
7019
7020         if (sample_type & PERF_SAMPLE_REGS_USER) {
7021                 /* regs dump ABI info */
7022                 int size = sizeof(u64);
7023
7024                 if (data->regs_user.regs) {
7025                         u64 mask = event->attr.sample_regs_user;
7026                         size += hweight64(mask) * sizeof(u64);
7027                 }
7028
7029                 header->size += size;
7030         }
7031
7032         if (sample_type & PERF_SAMPLE_STACK_USER) {
7033                 /*
7034                  * Either we need PERF_SAMPLE_STACK_USER bit to be always
7035                  * processed as the last one or have additional check added
7036                  * in case new sample type is added, because we could eat
7037                  * up the rest of the sample size.
7038                  */
7039                 u16 stack_size = event->attr.sample_stack_user;
7040                 u16 size = sizeof(u64);
7041
7042                 stack_size = perf_sample_ustack_size(stack_size, header->size,
7043                                                      data->regs_user.regs);
7044
7045                 /*
7046                  * If there is something to dump, add space for the dump
7047                  * itself and for the field that tells the dynamic size,
7048                  * which is how many have been actually dumped.
7049                  */
7050                 if (stack_size)
7051                         size += sizeof(u64) + stack_size;
7052
7053                 data->stack_user_size = stack_size;
7054                 header->size += size;
7055         }
7056
7057         if (sample_type & PERF_SAMPLE_REGS_INTR) {
7058                 /* regs dump ABI info */
7059                 int size = sizeof(u64);
7060
7061                 perf_sample_regs_intr(&data->regs_intr, regs);
7062
7063                 if (data->regs_intr.regs) {
7064                         u64 mask = event->attr.sample_regs_intr;
7065
7066                         size += hweight64(mask) * sizeof(u64);
7067                 }
7068
7069                 header->size += size;
7070         }
7071
7072         if (sample_type & PERF_SAMPLE_PHYS_ADDR)
7073                 data->phys_addr = perf_virt_to_phys(data->addr);
7074
7075 #ifdef CONFIG_CGROUP_PERF
7076         if (sample_type & PERF_SAMPLE_CGROUP) {
7077                 struct cgroup *cgrp;
7078
7079                 /* protected by RCU */
7080                 cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
7081                 data->cgroup = cgroup_id(cgrp);
7082         }
7083 #endif
7084
7085         if (sample_type & PERF_SAMPLE_AUX) {
7086                 u64 size;
7087
7088                 header->size += sizeof(u64); /* size */
7089
7090                 /*
7091                  * Given the 16bit nature of header::size, an AUX sample can
7092                  * easily overflow it, what with all the preceding sample bits.
7093                  * Make sure this doesn't happen by using up to U16_MAX bytes
7094                  * per sample in total (rounded down to 8 byte boundary).
7095                  */
7096                 size = min_t(size_t, U16_MAX - header->size,
7097                              event->attr.aux_sample_size);
7098                 size = rounddown(size, 8);
7099                 size = perf_prepare_sample_aux(event, data, size);
7100
7101                 WARN_ON_ONCE(size + header->size > U16_MAX);
7102                 header->size += size;
7103         }
7104         /*
7105          * If you're adding more sample types here, you likely need to do
7106          * something about the overflowing header::size, like repurpose the
7107          * lowest 3 bits of size, which should be always zero at the moment.
7108          * This raises a more important question, do we really need 512k sized
7109          * samples and why, so good argumentation is in order for whatever you
7110          * do here next.
7111          */
7112         WARN_ON_ONCE(header->size & 7);
7113 }
7114
7115 static __always_inline int
7116 __perf_event_output(struct perf_event *event,
7117                     struct perf_sample_data *data,
7118                     struct pt_regs *regs,
7119                     int (*output_begin)(struct perf_output_handle *,
7120                                         struct perf_event *,
7121                                         unsigned int))
7122 {
7123         struct perf_output_handle handle;
7124         struct perf_event_header header;
7125         int err;
7126
7127         /* protect the callchain buffers */
7128         rcu_read_lock();
7129
7130         perf_prepare_sample(&header, data, event, regs);
7131
7132         err = output_begin(&handle, event, header.size);
7133         if (err)
7134                 goto exit;
7135
7136         perf_output_sample(&handle, &header, data, event);
7137
7138         perf_output_end(&handle);
7139
7140 exit:
7141         rcu_read_unlock();
7142         return err;
7143 }
7144
7145 void
7146 perf_event_output_forward(struct perf_event *event,
7147                          struct perf_sample_data *data,
7148                          struct pt_regs *regs)
7149 {
7150         __perf_event_output(event, data, regs, perf_output_begin_forward);
7151 }
7152
7153 void
7154 perf_event_output_backward(struct perf_event *event,
7155                            struct perf_sample_data *data,
7156                            struct pt_regs *regs)
7157 {
7158         __perf_event_output(event, data, regs, perf_output_begin_backward);
7159 }
7160
7161 int
7162 perf_event_output(struct perf_event *event,
7163                   struct perf_sample_data *data,
7164                   struct pt_regs *regs)
7165 {
7166         return __perf_event_output(event, data, regs, perf_output_begin);
7167 }
7168
7169 /*
7170  * read event_id
7171  */
7172
7173 struct perf_read_event {
7174         struct perf_event_header        header;
7175
7176         u32                             pid;
7177         u32                             tid;
7178 };
7179
7180 static void
7181 perf_event_read_event(struct perf_event *event,
7182                         struct task_struct *task)
7183 {
7184         struct perf_output_handle handle;
7185         struct perf_sample_data sample;
7186         struct perf_read_event read_event = {
7187                 .header = {
7188                         .type = PERF_RECORD_READ,
7189                         .misc = 0,
7190                         .size = sizeof(read_event) + event->read_size,
7191                 },
7192                 .pid = perf_event_pid(event, task),
7193                 .tid = perf_event_tid(event, task),
7194         };
7195         int ret;
7196
7197         perf_event_header__init_id(&read_event.header, &sample, event);
7198         ret = perf_output_begin(&handle, event, read_event.header.size);
7199         if (ret)
7200                 return;
7201
7202         perf_output_put(&handle, read_event);
7203         perf_output_read(&handle, event);
7204         perf_event__output_id_sample(event, &handle, &sample);
7205
7206         perf_output_end(&handle);
7207 }
7208
7209 typedef void (perf_iterate_f)(struct perf_event *event, void *data);
7210
7211 static void
7212 perf_iterate_ctx(struct perf_event_context *ctx,
7213                    perf_iterate_f output,
7214                    void *data, bool all)
7215 {
7216         struct perf_event *event;
7217
7218         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
7219                 if (!all) {
7220                         if (event->state < PERF_EVENT_STATE_INACTIVE)
7221                                 continue;
7222                         if (!event_filter_match(event))
7223                                 continue;
7224                 }
7225
7226                 output(event, data);
7227         }
7228 }
7229
7230 static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
7231 {
7232         struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
7233         struct perf_event *event;
7234
7235         list_for_each_entry_rcu(event, &pel->list, sb_list) {
7236                 /*
7237                  * Skip events that are not fully formed yet; ensure that
7238                  * if we observe event->ctx, both event and ctx will be
7239                  * complete enough. See perf_install_in_context().
7240                  */
7241                 if (!smp_load_acquire(&event->ctx))
7242                         continue;
7243
7244                 if (event->state < PERF_EVENT_STATE_INACTIVE)
7245                         continue;
7246                 if (!event_filter_match(event))
7247                         continue;
7248                 output(event, data);
7249         }
7250 }
7251
7252 /*
7253  * Iterate all events that need to receive side-band events.
7254  *
7255  * For new callers; ensure that account_pmu_sb_event() includes
7256  * your event, otherwise it might not get delivered.
7257  */
7258 static void
7259 perf_iterate_sb(perf_iterate_f output, void *data,
7260                struct perf_event_context *task_ctx)
7261 {
7262         struct perf_event_context *ctx;
7263         int ctxn;
7264
7265         rcu_read_lock();
7266         preempt_disable();
7267
7268         /*
7269          * If we have task_ctx != NULL we only notify the task context itself.
7270          * The task_ctx is set only for EXIT events before releasing task
7271          * context.
7272          */
7273         if (task_ctx) {
7274                 perf_iterate_ctx(task_ctx, output, data, false);
7275                 goto done;
7276         }
7277
7278         perf_iterate_sb_cpu(output, data);
7279
7280         for_each_task_context_nr(ctxn) {
7281                 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
7282                 if (ctx)
7283                         perf_iterate_ctx(ctx, output, data, false);
7284         }
7285 done:
7286         preempt_enable();
7287         rcu_read_unlock();
7288 }
7289
7290 /*
7291  * Clear all file-based filters at exec, they'll have to be
7292  * re-instated when/if these objects are mmapped again.
7293  */
7294 static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
7295 {
7296         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
7297         struct perf_addr_filter *filter;
7298         unsigned int restart = 0, count = 0;
7299         unsigned long flags;
7300
7301         if (!has_addr_filter(event))
7302                 return;
7303
7304         raw_spin_lock_irqsave(&ifh->lock, flags);
7305         list_for_each_entry(filter, &ifh->list, entry) {
7306                 if (filter->path.dentry) {
7307                         event->addr_filter_ranges[count].start = 0;
7308                         event->addr_filter_ranges[count].size = 0;
7309                         restart++;
7310                 }
7311
7312                 count++;
7313         }
7314
7315         if (restart)
7316                 event->addr_filters_gen++;
7317         raw_spin_unlock_irqrestore(&ifh->lock, flags);
7318
7319         if (restart)
7320                 perf_event_stop(event, 1);
7321 }
7322
7323 void perf_event_exec(void)
7324 {
7325         struct perf_event_context *ctx;
7326         int ctxn;
7327
7328         rcu_read_lock();
7329         for_each_task_context_nr(ctxn) {
7330                 ctx = current->perf_event_ctxp[ctxn];
7331                 if (!ctx)
7332                         continue;
7333
7334                 perf_event_enable_on_exec(ctxn);
7335
7336                 perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
7337                                    true);
7338         }
7339         rcu_read_unlock();
7340 }
7341
7342 struct remote_output {
7343         struct perf_buffer      *rb;
7344         int                     err;
7345 };
7346
7347 static void __perf_event_output_stop(struct perf_event *event, void *data)
7348 {
7349         struct perf_event *parent = event->parent;
7350         struct remote_output *ro = data;
7351         struct perf_buffer *rb = ro->rb;
7352         struct stop_event_data sd = {
7353                 .event  = event,
7354         };
7355
7356         if (!has_aux(event))
7357                 return;
7358
7359         if (!parent)
7360                 parent = event;
7361
7362         /*
7363          * In case of inheritance, it will be the parent that links to the
7364          * ring-buffer, but it will be the child that's actually using it.
7365          *
7366          * We are using event::rb to determine if the event should be stopped,
7367          * however this may race with ring_buffer_attach() (through set_output),
7368          * which will make us skip the event that actually needs to be stopped.
7369          * So ring_buffer_attach() has to stop an aux event before re-assigning
7370          * its rb pointer.
7371          */
7372         if (rcu_dereference(parent->rb) == rb)
7373                 ro->err = __perf_event_stop(&sd);
7374 }
7375
7376 static int __perf_pmu_output_stop(void *info)
7377 {
7378         struct perf_event *event = info;
7379         struct pmu *pmu = event->ctx->pmu;
7380         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
7381         struct remote_output ro = {
7382                 .rb     = event->rb,
7383         };
7384
7385         rcu_read_lock();
7386         perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
7387         if (cpuctx->task_ctx)
7388                 perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
7389                                    &ro, false);
7390         rcu_read_unlock();
7391
7392         return ro.err;
7393 }
7394
7395 static void perf_pmu_output_stop(struct perf_event *event)
7396 {
7397         struct perf_event *iter;
7398         int err, cpu;
7399
7400 restart:
7401         rcu_read_lock();
7402         list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
7403                 /*
7404                  * For per-CPU events, we need to make sure that neither they
7405                  * nor their children are running; for cpu==-1 events it's
7406                  * sufficient to stop the event itself if it's active, since
7407                  * it can't have children.
7408                  */
7409                 cpu = iter->cpu;
7410                 if (cpu == -1)
7411                         cpu = READ_ONCE(iter->oncpu);
7412
7413                 if (cpu == -1)
7414                         continue;
7415
7416                 err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
7417                 if (err == -EAGAIN) {
7418                         rcu_read_unlock();
7419                         goto restart;
7420                 }
7421         }
7422         rcu_read_unlock();
7423 }
7424
7425 /*
7426  * task tracking -- fork/exit
7427  *
7428  * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
7429  */
7430
7431 struct perf_task_event {
7432         struct task_struct              *task;
7433         struct perf_event_context       *task_ctx;
7434
7435         struct {
7436                 struct perf_event_header        header;
7437
7438                 u32                             pid;
7439                 u32                             ppid;
7440                 u32                             tid;
7441                 u32                             ptid;
7442                 u64                             time;
7443         } event_id;
7444 };
7445
7446 static int perf_event_task_match(struct perf_event *event)
7447 {
7448         return event->attr.comm  || event->attr.mmap ||
7449                event->attr.mmap2 || event->attr.mmap_data ||
7450                event->attr.task;
7451 }
7452
7453 static void perf_event_task_output(struct perf_event *event,
7454                                    void *data)
7455 {
7456         struct perf_task_event *task_event = data;
7457         struct perf_output_handle handle;
7458         struct perf_sample_data sample;
7459         struct task_struct *task = task_event->task;
7460         int ret, size = task_event->event_id.header.size;
7461
7462         if (!perf_event_task_match(event))
7463                 return;
7464
7465         perf_event_header__init_id(&task_event->event_id.header, &sample, event);
7466
7467         ret = perf_output_begin(&handle, event,
7468                                 task_event->event_id.header.size);
7469         if (ret)
7470                 goto out;
7471
7472         task_event->event_id.pid = perf_event_pid(event, task);
7473         task_event->event_id.ppid = perf_event_pid(event, current);
7474
7475         task_event->event_id.tid = perf_event_tid(event, task);
7476         task_event->event_id.ptid = perf_event_tid(event, current);
7477
7478         task_event->event_id.time = perf_event_clock(event);
7479
7480         perf_output_put(&handle, task_event->event_id);
7481
7482         perf_event__output_id_sample(event, &handle, &sample);
7483
7484         perf_output_end(&handle);
7485 out:
7486         task_event->event_id.header.size = size;
7487 }
7488
7489 static void perf_event_task(struct task_struct *task,
7490                               struct perf_event_context *task_ctx,
7491                               int new)
7492 {
7493         struct perf_task_event task_event;
7494
7495         if (!atomic_read(&nr_comm_events) &&
7496             !atomic_read(&nr_mmap_events) &&
7497             !atomic_read(&nr_task_events))
7498                 return;
7499
7500         task_event = (struct perf_task_event){
7501                 .task     = task,
7502                 .task_ctx = task_ctx,
7503                 .event_id    = {
7504                         .header = {
7505                                 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
7506                                 .misc = 0,
7507                                 .size = sizeof(task_event.event_id),
7508                         },
7509                         /* .pid  */
7510                         /* .ppid */
7511                         /* .tid  */
7512                         /* .ptid */
7513                         /* .time */
7514                 },
7515         };
7516
7517         perf_iterate_sb(perf_event_task_output,
7518                        &task_event,
7519                        task_ctx);
7520 }
7521
7522 void perf_event_fork(struct task_struct *task)
7523 {
7524         perf_event_task(task, NULL, 1);
7525         perf_event_namespaces(task);
7526 }
7527
7528 /*
7529  * comm tracking
7530  */
7531
7532 struct perf_comm_event {
7533         struct task_struct      *task;
7534         char                    *comm;
7535         int                     comm_size;
7536
7537         struct {
7538                 struct perf_event_header        header;
7539
7540                 u32                             pid;
7541                 u32                             tid;
7542         } event_id;
7543 };
7544
7545 static int perf_event_comm_match(struct perf_event *event)
7546 {
7547         return event->attr.comm;
7548 }
7549
7550 static void perf_event_comm_output(struct perf_event *event,
7551                                    void *data)
7552 {
7553         struct perf_comm_event *comm_event = data;
7554         struct perf_output_handle handle;
7555         struct perf_sample_data sample;
7556         int size = comm_event->event_id.header.size;
7557         int ret;
7558
7559         if (!perf_event_comm_match(event))
7560                 return;
7561
7562         perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
7563         ret = perf_output_begin(&handle, event,
7564                                 comm_event->event_id.header.size);
7565
7566         if (ret)
7567                 goto out;
7568
7569         comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
7570         comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
7571
7572         perf_output_put(&handle, comm_event->event_id);
7573         __output_copy(&handle, comm_event->comm,
7574                                    comm_event->comm_size);
7575
7576         perf_event__output_id_sample(event, &handle, &sample);
7577
7578         perf_output_end(&handle);
7579 out:
7580         comm_event->event_id.header.size = size;
7581 }
7582
7583 static void perf_event_comm_event(struct perf_comm_event *comm_event)
7584 {
7585         char comm[TASK_COMM_LEN];
7586         unsigned int size;
7587
7588         memset(comm, 0, sizeof(comm));
7589         strlcpy(comm, comm_event->task->comm, sizeof(comm));
7590         size = ALIGN(strlen(comm)+1, sizeof(u64));
7591
7592         comm_event->comm = comm;
7593         comm_event->comm_size = size;
7594
7595         comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
7596
7597         perf_iterate_sb(perf_event_comm_output,
7598                        comm_event,
7599                        NULL);
7600 }
7601
7602 void perf_event_comm(struct task_struct *task, bool exec)
7603 {
7604         struct perf_comm_event comm_event;
7605
7606         if (!atomic_read(&nr_comm_events))
7607                 return;
7608
7609         comm_event = (struct perf_comm_event){
7610                 .task   = task,
7611                 /* .comm      */
7612                 /* .comm_size */
7613                 .event_id  = {
7614                         .header = {
7615                                 .type = PERF_RECORD_COMM,
7616                                 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
7617                                 /* .size */
7618                         },
7619                         /* .pid */
7620                         /* .tid */
7621                 },
7622         };
7623
7624         perf_event_comm_event(&comm_event);
7625 }
7626
7627 /*
7628  * namespaces tracking
7629  */
7630
7631 struct perf_namespaces_event {
7632         struct task_struct              *task;
7633
7634         struct {
7635                 struct perf_event_header        header;
7636
7637                 u32                             pid;
7638                 u32                             tid;
7639                 u64                             nr_namespaces;
7640                 struct perf_ns_link_info        link_info[NR_NAMESPACES];
7641         } event_id;
7642 };
7643
7644 static int perf_event_namespaces_match(struct perf_event *event)
7645 {
7646         return event->attr.namespaces;
7647 }
7648
7649 static void perf_event_namespaces_output(struct perf_event *event,
7650                                          void *data)
7651 {
7652         struct perf_namespaces_event *namespaces_event = data;
7653         struct perf_output_handle handle;
7654         struct perf_sample_data sample;
7655         u16 header_size = namespaces_event->event_id.header.size;
7656         int ret;
7657
7658         if (!perf_event_namespaces_match(event))
7659                 return;
7660
7661         perf_event_header__init_id(&namespaces_event->event_id.header,
7662                                    &sample, event);
7663         ret = perf_output_begin(&handle, event,
7664                                 namespaces_event->event_id.header.size);
7665         if (ret)
7666                 goto out;
7667
7668         namespaces_event->event_id.pid = perf_event_pid(event,
7669                                                         namespaces_event->task);
7670         namespaces_event->event_id.tid = perf_event_tid(event,
7671                                                         namespaces_event->task);
7672
7673         perf_output_put(&handle, namespaces_event->event_id);
7674
7675         perf_event__output_id_sample(event, &handle, &sample);
7676
7677         perf_output_end(&handle);
7678 out:
7679         namespaces_event->event_id.header.size = header_size;
7680 }
7681
7682 static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
7683                                    struct task_struct *task,
7684                                    const struct proc_ns_operations *ns_ops)
7685 {
7686         struct path ns_path;
7687         struct inode *ns_inode;
7688         int error;
7689
7690         error = ns_get_path(&ns_path, task, ns_ops);
7691         if (!error) {
7692                 ns_inode = ns_path.dentry->d_inode;
7693                 ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
7694                 ns_link_info->ino = ns_inode->i_ino;
7695                 path_put(&ns_path);
7696         }
7697 }
7698
7699 void perf_event_namespaces(struct task_struct *task)
7700 {
7701         struct perf_namespaces_event namespaces_event;
7702         struct perf_ns_link_info *ns_link_info;
7703
7704         if (!atomic_read(&nr_namespaces_events))
7705                 return;
7706
7707         namespaces_event = (struct perf_namespaces_event){
7708                 .task   = task,
7709                 .event_id  = {
7710                         .header = {
7711                                 .type = PERF_RECORD_NAMESPACES,
7712                                 .misc = 0,
7713                                 .size = sizeof(namespaces_event.event_id),
7714                         },
7715                         /* .pid */
7716                         /* .tid */
7717                         .nr_namespaces = NR_NAMESPACES,
7718                         /* .link_info[NR_NAMESPACES] */
7719                 },
7720         };
7721
7722         ns_link_info = namespaces_event.event_id.link_info;
7723
7724         perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
7725                                task, &mntns_operations);
7726
7727 #ifdef CONFIG_USER_NS
7728         perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
7729                                task, &userns_operations);
7730 #endif
7731 #ifdef CONFIG_NET_NS
7732         perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
7733                                task, &netns_operations);
7734 #endif
7735 #ifdef CONFIG_UTS_NS
7736         perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
7737                                task, &utsns_operations);
7738 #endif
7739 #ifdef CONFIG_IPC_NS
7740         perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
7741                                task, &ipcns_operations);
7742 #endif
7743 #ifdef CONFIG_PID_NS
7744         perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
7745                                task, &pidns_operations);
7746 #endif
7747 #ifdef CONFIG_CGROUPS
7748         perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
7749                                task, &cgroupns_operations);
7750 #endif
7751
7752         perf_iterate_sb(perf_event_namespaces_output,
7753                         &namespaces_event,
7754                         NULL);
7755 }
7756
7757 /*
7758  * cgroup tracking
7759  */
7760 #ifdef CONFIG_CGROUP_PERF
7761
7762 struct perf_cgroup_event {
7763         char                            *path;
7764         int                             path_size;
7765         struct {
7766                 struct perf_event_header        header;
7767                 u64                             id;
7768                 char                            path[];
7769         } event_id;
7770 };
7771
7772 static int perf_event_cgroup_match(struct perf_event *event)
7773 {
7774         return event->attr.cgroup;
7775 }
7776
7777 static void perf_event_cgroup_output(struct perf_event *event, void *data)
7778 {
7779         struct perf_cgroup_event *cgroup_event = data;
7780         struct perf_output_handle handle;
7781         struct perf_sample_data sample;
7782         u16 header_size = cgroup_event->event_id.header.size;
7783         int ret;
7784
7785         if (!perf_event_cgroup_match(event))
7786                 return;
7787
7788         perf_event_header__init_id(&cgroup_event->event_id.header,
7789                                    &sample, event);
7790         ret = perf_output_begin(&handle, event,
7791                                 cgroup_event->event_id.header.size);
7792         if (ret)
7793                 goto out;
7794
7795         perf_output_put(&handle, cgroup_event->event_id);
7796         __output_copy(&handle, cgroup_event->path, cgroup_event->path_size);
7797
7798         perf_event__output_id_sample(event, &handle, &sample);
7799
7800         perf_output_end(&handle);
7801 out:
7802         cgroup_event->event_id.header.size = header_size;
7803 }
7804
7805 static void perf_event_cgroup(struct cgroup *cgrp)
7806 {
7807         struct perf_cgroup_event cgroup_event;
7808         char path_enomem[16] = "//enomem";
7809         char *pathname;
7810         size_t size;
7811
7812         if (!atomic_read(&nr_cgroup_events))
7813                 return;
7814
7815         cgroup_event = (struct perf_cgroup_event){
7816                 .event_id  = {
7817                         .header = {
7818                                 .type = PERF_RECORD_CGROUP,
7819                                 .misc = 0,
7820                                 .size = sizeof(cgroup_event.event_id),
7821                         },
7822                         .id = cgroup_id(cgrp),
7823                 },
7824         };
7825
7826         pathname = kmalloc(PATH_MAX, GFP_KERNEL);
7827         if (pathname == NULL) {
7828                 cgroup_event.path = path_enomem;
7829         } else {
7830                 /* just to be sure to have enough space for alignment */
7831                 cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64));
7832                 cgroup_event.path = pathname;
7833         }
7834
7835         /*
7836          * Since our buffer works in 8 byte units we need to align our string
7837          * size to a multiple of 8. However, we must guarantee the tail end is
7838          * zero'd out to avoid leaking random bits to userspace.
7839          */
7840         size = strlen(cgroup_event.path) + 1;
7841         while (!IS_ALIGNED(size, sizeof(u64)))
7842                 cgroup_event.path[size++] = '\0';
7843
7844         cgroup_event.event_id.header.size += size;
7845         cgroup_event.path_size = size;
7846
7847         perf_iterate_sb(perf_event_cgroup_output,
7848                         &cgroup_event,
7849                         NULL);
7850
7851         kfree(pathname);
7852 }
7853
7854 #endif
7855
7856 /*
7857  * mmap tracking
7858  */
7859
7860 struct perf_mmap_event {
7861         struct vm_area_struct   *vma;
7862
7863         const char              *file_name;
7864         int                     file_size;
7865         int                     maj, min;
7866         u64                     ino;
7867         u64                     ino_generation;
7868         u32                     prot, flags;
7869
7870         struct {
7871                 struct perf_event_header        header;
7872
7873                 u32                             pid;
7874                 u32                             tid;
7875                 u64                             start;
7876                 u64                             len;
7877                 u64                             pgoff;
7878         } event_id;
7879 };
7880
7881 static int perf_event_mmap_match(struct perf_event *event,
7882                                  void *data)
7883 {
7884         struct perf_mmap_event *mmap_event = data;
7885         struct vm_area_struct *vma = mmap_event->vma;
7886         int executable = vma->vm_flags & VM_EXEC;
7887
7888         return (!executable && event->attr.mmap_data) ||
7889                (executable && (event->attr.mmap || event->attr.mmap2));
7890 }
7891
7892 static void perf_event_mmap_output(struct perf_event *event,
7893                                    void *data)
7894 {
7895         struct perf_mmap_event *mmap_event = data;
7896         struct perf_output_handle handle;
7897         struct perf_sample_data sample;
7898         int size = mmap_event->event_id.header.size;
7899         u32 type = mmap_event->event_id.header.type;
7900         int ret;
7901
7902         if (!perf_event_mmap_match(event, data))
7903                 return;
7904
7905         if (event->attr.mmap2) {
7906                 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
7907                 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
7908                 mmap_event->event_id.header.size += sizeof(mmap_event->min);
7909                 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
7910                 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
7911                 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
7912                 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
7913         }
7914
7915         perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
7916         ret = perf_output_begin(&handle, event,
7917                                 mmap_event->event_id.header.size);
7918         if (ret)
7919                 goto out;
7920
7921         mmap_event->event_id.pid = perf_event_pid(event, current);
7922         mmap_event->event_id.tid = perf_event_tid(event, current);
7923
7924         perf_output_put(&handle, mmap_event->event_id);
7925
7926         if (event->attr.mmap2) {
7927                 perf_output_put(&handle, mmap_event->maj);
7928                 perf_output_put(&handle, mmap_event->min);
7929                 perf_output_put(&handle, mmap_event->ino);
7930                 perf_output_put(&handle, mmap_event->ino_generation);
7931                 perf_output_put(&handle, mmap_event->prot);
7932                 perf_output_put(&handle, mmap_event->flags);
7933         }
7934
7935         __output_copy(&handle, mmap_event->file_name,
7936                                    mmap_event->file_size);
7937
7938         perf_event__output_id_sample(event, &handle, &sample);
7939
7940         perf_output_end(&handle);
7941 out:
7942         mmap_event->event_id.header.size = size;
7943         mmap_event->event_id.header.type = type;
7944 }
7945
7946 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
7947 {
7948         struct vm_area_struct *vma = mmap_event->vma;
7949         struct file *file = vma->vm_file;
7950         int maj = 0, min = 0;
7951         u64 ino = 0, gen = 0;
7952         u32 prot = 0, flags = 0;
7953         unsigned int size;
7954         char tmp[16];
7955         char *buf = NULL;
7956         char *name;
7957
7958         if (vma->vm_flags & VM_READ)
7959                 prot |= PROT_READ;
7960         if (vma->vm_flags & VM_WRITE)
7961                 prot |= PROT_WRITE;
7962         if (vma->vm_flags & VM_EXEC)
7963                 prot |= PROT_EXEC;
7964
7965         if (vma->vm_flags & VM_MAYSHARE)
7966                 flags = MAP_SHARED;
7967         else
7968                 flags = MAP_PRIVATE;
7969
7970         if (vma->vm_flags & VM_DENYWRITE)
7971                 flags |= MAP_DENYWRITE;
7972         if (vma->vm_flags & VM_MAYEXEC)
7973                 flags |= MAP_EXECUTABLE;
7974         if (vma->vm_flags & VM_LOCKED)
7975                 flags |= MAP_LOCKED;
7976         if (vma->vm_flags & VM_HUGETLB)
7977                 flags |= MAP_HUGETLB;
7978
7979         if (file) {
7980                 struct inode *inode;
7981                 dev_t dev;
7982
7983                 buf = kmalloc(PATH_MAX, GFP_KERNEL);
7984                 if (!buf) {
7985                         name = "//enomem";
7986                         goto cpy_name;
7987                 }
7988                 /*
7989                  * d_path() works from the end of the rb backwards, so we
7990                  * need to add enough zero bytes after the string to handle
7991                  * the 64bit alignment we do later.
7992                  */
7993                 name = file_path(file, buf, PATH_MAX - sizeof(u64));
7994                 if (IS_ERR(name)) {
7995                         name = "//toolong";
7996                         goto cpy_name;
7997                 }
7998                 inode = file_inode(vma->vm_file);
7999                 dev = inode->i_sb->s_dev;
8000                 ino = inode->i_ino;
8001                 gen = inode->i_generation;
8002                 maj = MAJOR(dev);
8003                 min = MINOR(dev);
8004
8005                 goto got_name;
8006         } else {
8007                 if (vma->vm_ops && vma->vm_ops->name) {
8008                         name = (char *) vma->vm_ops->name(vma);
8009                         if (name)
8010                                 goto cpy_name;
8011                 }
8012
8013                 name = (char *)arch_vma_name(vma);
8014                 if (name)
8015                         goto cpy_name;
8016
8017                 if (vma->vm_start <= vma->vm_mm->start_brk &&
8018                                 vma->vm_end >= vma->vm_mm->brk) {
8019                         name = "[heap]";
8020                         goto cpy_name;
8021                 }
8022                 if (vma->vm_start <= vma->vm_mm->start_stack &&
8023                                 vma->vm_end >= vma->vm_mm->start_stack) {
8024                         name = "[stack]";
8025                         goto cpy_name;
8026                 }
8027
8028                 name = "//anon";
8029                 goto cpy_name;
8030         }
8031
8032 cpy_name:
8033         strlcpy(tmp, name, sizeof(tmp));
8034         name = tmp;
8035 got_name:
8036         /*
8037          * Since our buffer works in 8 byte units we need to align our string
8038          * size to a multiple of 8. However, we must guarantee the tail end is
8039          * zero'd out to avoid leaking random bits to userspace.
8040          */
8041         size = strlen(name)+1;
8042         while (!IS_ALIGNED(size, sizeof(u64)))
8043                 name[size++] = '\0';
8044
8045         mmap_event->file_name = name;
8046         mmap_event->file_size = size;
8047         mmap_event->maj = maj;
8048         mmap_event->min = min;
8049         mmap_event->ino = ino;
8050         mmap_event->ino_generation = gen;
8051         mmap_event->prot = prot;
8052         mmap_event->flags = flags;
8053
8054         if (!(vma->vm_flags & VM_EXEC))
8055                 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
8056
8057         mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
8058
8059         perf_iterate_sb(perf_event_mmap_output,
8060                        mmap_event,
8061                        NULL);
8062
8063         kfree(buf);
8064 }
8065
8066 /*
8067  * Check whether inode and address range match filter criteria.
8068  */
8069 static bool perf_addr_filter_match(struct perf_addr_filter *filter,
8070                                      struct file *file, unsigned long offset,
8071                                      unsigned long size)
8072 {
8073         /* d_inode(NULL) won't be equal to any mapped user-space file */
8074         if (!filter->path.dentry)
8075                 return false;
8076
8077         if (d_inode(filter->path.dentry) != file_inode(file))
8078                 return false;
8079
8080         if (filter->offset > offset + size)
8081                 return false;
8082
8083         if (filter->offset + filter->size < offset)
8084                 return false;
8085
8086         return true;
8087 }
8088
8089 static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter,
8090                                         struct vm_area_struct *vma,
8091                                         struct perf_addr_filter_range *fr)
8092 {
8093         unsigned long vma_size = vma->vm_end - vma->vm_start;
8094         unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
8095         struct file *file = vma->vm_file;
8096
8097         if (!perf_addr_filter_match(filter, file, off, vma_size))
8098                 return false;
8099
8100         if (filter->offset < off) {
8101                 fr->start = vma->vm_start;
8102                 fr->size = min(vma_size, filter->size - (off - filter->offset));
8103         } else {
8104                 fr->start = vma->vm_start + filter->offset - off;
8105                 fr->size = min(vma->vm_end - fr->start, filter->size);
8106         }
8107
8108         return true;
8109 }
8110
8111 static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
8112 {
8113         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
8114         struct vm_area_struct *vma = data;
8115         struct perf_addr_filter *filter;
8116         unsigned int restart = 0, count = 0;
8117         unsigned long flags;
8118
8119         if (!has_addr_filter(event))
8120                 return;
8121
8122         if (!vma->vm_file)
8123                 return;
8124
8125         raw_spin_lock_irqsave(&ifh->lock, flags);
8126         list_for_each_entry(filter, &ifh->list, entry) {
8127                 if (perf_addr_filter_vma_adjust(filter, vma,
8128                                                 &event->addr_filter_ranges[count]))
8129                         restart++;
8130
8131                 count++;
8132         }
8133
8134         if (restart)
8135                 event->addr_filters_gen++;
8136         raw_spin_unlock_irqrestore(&ifh->lock, flags);
8137
8138         if (restart)
8139                 perf_event_stop(event, 1);
8140 }
8141
8142 /*
8143  * Adjust all task's events' filters to the new vma
8144  */
8145 static void perf_addr_filters_adjust(struct vm_area_struct *vma)
8146 {
8147         struct perf_event_context *ctx;
8148         int ctxn;
8149
8150         /*
8151          * Data tracing isn't supported yet and as such there is no need
8152          * to keep track of anything that isn't related to executable code:
8153          */
8154         if (!(vma->vm_flags & VM_EXEC))
8155                 return;
8156
8157         rcu_read_lock();
8158         for_each_task_context_nr(ctxn) {
8159                 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
8160                 if (!ctx)
8161                         continue;
8162
8163                 perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
8164         }
8165         rcu_read_unlock();
8166 }
8167
8168 void perf_event_mmap(struct vm_area_struct *vma)
8169 {
8170         struct perf_mmap_event mmap_event;
8171
8172         if (!atomic_read(&nr_mmap_events))
8173                 return;
8174
8175         mmap_event = (struct perf_mmap_event){
8176                 .vma    = vma,
8177                 /* .file_name */
8178                 /* .file_size */
8179                 .event_id  = {
8180                         .header = {
8181                                 .type = PERF_RECORD_MMAP,
8182                                 .misc = PERF_RECORD_MISC_USER,
8183                                 /* .size */
8184                         },
8185                         /* .pid */
8186                         /* .tid */
8187                         .start  = vma->vm_start,
8188                         .len    = vma->vm_end - vma->vm_start,
8189                         .pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
8190                 },
8191                 /* .maj (attr_mmap2 only) */
8192                 /* .min (attr_mmap2 only) */
8193                 /* .ino (attr_mmap2 only) */
8194                 /* .ino_generation (attr_mmap2 only) */
8195                 /* .prot (attr_mmap2 only) */
8196                 /* .flags (attr_mmap2 only) */
8197         };
8198
8199         perf_addr_filters_adjust(vma);
8200         perf_event_mmap_event(&mmap_event);
8201 }
8202
8203 void perf_event_aux_event(struct perf_event *event, unsigned long head,
8204                           unsigned long size, u64 flags)
8205 {
8206         struct perf_output_handle handle;
8207         struct perf_sample_data sample;
8208         struct perf_aux_event {
8209                 struct perf_event_header        header;
8210                 u64                             offset;
8211                 u64                             size;
8212                 u64                             flags;
8213         } rec = {
8214                 .header = {
8215                         .type = PERF_RECORD_AUX,
8216                         .misc = 0,
8217                         .size = sizeof(rec),
8218                 },
8219                 .offset         = head,
8220                 .size           = size,
8221                 .flags          = flags,
8222         };
8223         int ret;
8224
8225         perf_event_header__init_id(&rec.header, &sample, event);
8226         ret = perf_output_begin(&handle, event, rec.header.size);
8227
8228         if (ret)
8229                 return;
8230
8231         perf_output_put(&handle, rec);
8232         perf_event__output_id_sample(event, &handle, &sample);
8233
8234         perf_output_end(&handle);
8235 }
8236
8237 /*
8238  * Lost/dropped samples logging
8239  */
8240 void perf_log_lost_samples(struct perf_event *event, u64 lost)
8241 {
8242         struct perf_output_handle handle;
8243         struct perf_sample_data sample;
8244         int ret;
8245
8246         struct {
8247                 struct perf_event_header        header;
8248                 u64                             lost;
8249         } lost_samples_event = {
8250                 .header = {
8251                         .type = PERF_RECORD_LOST_SAMPLES,
8252                         .misc = 0,
8253                         .size = sizeof(lost_samples_event),
8254                 },
8255                 .lost           = lost,
8256         };
8257
8258         perf_event_header__init_id(&lost_samples_event.header, &sample, event);
8259
8260         ret = perf_output_begin(&handle, event,
8261                                 lost_samples_event.header.size);
8262         if (ret)
8263                 return;
8264
8265         perf_output_put(&handle, lost_samples_event);
8266         perf_event__output_id_sample(event, &handle, &sample);
8267         perf_output_end(&handle);
8268 }
8269
8270 /*
8271  * context_switch tracking
8272  */
8273
8274 struct perf_switch_event {
8275         struct task_struct      *task;
8276         struct task_struct      *next_prev;
8277
8278         struct {
8279                 struct perf_event_header        header;
8280                 u32                             next_prev_pid;
8281                 u32                             next_prev_tid;
8282         } event_id;
8283 };
8284
8285 static int perf_event_switch_match(struct perf_event *event)
8286 {
8287         return event->attr.context_switch;
8288 }
8289
8290 static void perf_event_switch_output(struct perf_event *event, void *data)
8291 {
8292         struct perf_switch_event *se = data;
8293         struct perf_output_handle handle;
8294         struct perf_sample_data sample;
8295         int ret;
8296
8297         if (!perf_event_switch_match(event))
8298                 return;
8299
8300         /* Only CPU-wide events are allowed to see next/prev pid/tid */
8301         if (event->ctx->task) {
8302                 se->event_id.header.type = PERF_RECORD_SWITCH;
8303                 se->event_id.header.size = sizeof(se->event_id.header);
8304         } else {
8305                 se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
8306                 se->event_id.header.size = sizeof(se->event_id);
8307                 se->event_id.next_prev_pid =
8308                                         perf_event_pid(event, se->next_prev);
8309                 se->event_id.next_prev_tid =
8310                                         perf_event_tid(event, se->next_prev);
8311         }
8312
8313         perf_event_header__init_id(&se->event_id.header, &sample, event);
8314
8315         ret = perf_output_begin(&handle, event, se->event_id.header.size);
8316         if (ret)
8317                 return;
8318
8319         if (event->ctx->task)
8320                 perf_output_put(&handle, se->event_id.header);
8321         else
8322                 perf_output_put(&handle, se->event_id);
8323
8324         perf_event__output_id_sample(event, &handle, &sample);
8325
8326         perf_output_end(&handle);
8327 }
8328
8329 static void perf_event_switch(struct task_struct *task,
8330                               struct task_struct *next_prev, bool sched_in)
8331 {
8332         struct perf_switch_event switch_event;
8333
8334         /* N.B. caller checks nr_switch_events != 0 */
8335
8336         switch_event = (struct perf_switch_event){
8337                 .task           = task,
8338                 .next_prev      = next_prev,
8339                 .event_id       = {
8340                         .header = {
8341                                 /* .type */
8342                                 .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
8343                                 /* .size */
8344                         },
8345                         /* .next_prev_pid */
8346                         /* .next_prev_tid */
8347                 },
8348         };
8349
8350         if (!sched_in && task->state == TASK_RUNNING)
8351                 switch_event.event_id.header.misc |=
8352                                 PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
8353
8354         perf_iterate_sb(perf_event_switch_output,
8355                        &switch_event,
8356                        NULL);
8357 }
8358
8359 /*
8360  * IRQ throttle logging
8361  */
8362
8363 static void perf_log_throttle(struct perf_event *event, int enable)
8364 {
8365         struct perf_output_handle handle;
8366         struct perf_sample_data sample;
8367         int ret;
8368
8369         struct {
8370                 struct perf_event_header        header;
8371                 u64                             time;
8372                 u64                             id;
8373                 u64                             stream_id;
8374         } throttle_event = {
8375                 .header = {
8376                         .type = PERF_RECORD_THROTTLE,
8377                         .misc = 0,
8378                         .size = sizeof(throttle_event),
8379                 },
8380                 .time           = perf_event_clock(event),
8381                 .id             = primary_event_id(event),
8382                 .stream_id      = event->id,
8383         };
8384
8385         if (enable)
8386                 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
8387
8388         perf_event_header__init_id(&throttle_event.header, &sample, event);
8389
8390         ret = perf_output_begin(&handle, event,
8391                                 throttle_event.header.size);
8392         if (ret)
8393                 return;
8394
8395         perf_output_put(&handle, throttle_event);
8396         perf_event__output_id_sample(event, &handle, &sample);
8397         perf_output_end(&handle);
8398 }
8399
8400 /*
8401  * ksymbol register/unregister tracking
8402  */
8403
8404 struct perf_ksymbol_event {
8405         const char      *name;
8406         int             name_len;
8407         struct {
8408                 struct perf_event_header        header;
8409                 u64                             addr;
8410                 u32                             len;
8411                 u16                             ksym_type;
8412                 u16                             flags;
8413         } event_id;
8414 };
8415
8416 static int perf_event_ksymbol_match(struct perf_event *event)
8417 {
8418         return event->attr.ksymbol;
8419 }
8420
8421 static void perf_event_ksymbol_output(struct perf_event *event, void *data)
8422 {
8423         struct perf_ksymbol_event *ksymbol_event = data;
8424         struct perf_output_handle handle;
8425         struct perf_sample_data sample;
8426         int ret;
8427
8428         if (!perf_event_ksymbol_match(event))
8429                 return;
8430
8431         perf_event_header__init_id(&ksymbol_event->event_id.header,
8432                                    &sample, event);
8433         ret = perf_output_begin(&handle, event,
8434                                 ksymbol_event->event_id.header.size);
8435         if (ret)
8436                 return;
8437
8438         perf_output_put(&handle, ksymbol_event->event_id);
8439         __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len);
8440         perf_event__output_id_sample(event, &handle, &sample);
8441
8442         perf_output_end(&handle);
8443 }
8444
8445 void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
8446                         const char *sym)
8447 {
8448         struct perf_ksymbol_event ksymbol_event;
8449         char name[KSYM_NAME_LEN];
8450         u16 flags = 0;
8451         int name_len;
8452
8453         if (!atomic_read(&nr_ksymbol_events))
8454                 return;
8455
8456         if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX ||
8457             ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
8458                 goto err;
8459
8460         strlcpy(name, sym, KSYM_NAME_LEN);
8461         name_len = strlen(name) + 1;
8462         while (!IS_ALIGNED(name_len, sizeof(u64)))
8463                 name[name_len++] = '\0';
8464         BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));
8465
8466         if (unregister)
8467                 flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER;
8468
8469         ksymbol_event = (struct perf_ksymbol_event){
8470                 .name = name,
8471                 .name_len = name_len,
8472                 .event_id = {
8473                         .header = {
8474                                 .type = PERF_RECORD_KSYMBOL,
8475                                 .size = sizeof(ksymbol_event.event_id) +
8476                                         name_len,
8477                         },
8478                         .addr = addr,
8479                         .len = len,
8480                         .ksym_type = ksym_type,
8481                         .flags = flags,
8482                 },
8483         };
8484
8485         perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL);
8486         return;
8487 err:
8488         WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
8489 }
8490
8491 /*
8492  * bpf program load/unload tracking
8493  */
8494
8495 struct perf_bpf_event {
8496         struct bpf_prog *prog;
8497         struct {
8498                 struct perf_event_header        header;
8499                 u16                             type;
8500                 u16                             flags;
8501                 u32                             id;
8502                 u8                              tag[BPF_TAG_SIZE];
8503         } event_id;
8504 };
8505
8506 static int perf_event_bpf_match(struct perf_event *event)
8507 {
8508         return event->attr.bpf_event;
8509 }
8510
8511 static void perf_event_bpf_output(struct perf_event *event, void *data)
8512 {
8513         struct perf_bpf_event *bpf_event = data;
8514         struct perf_output_handle handle;
8515         struct perf_sample_data sample;
8516         int ret;
8517
8518         if (!perf_event_bpf_match(event))
8519                 return;
8520
8521         perf_event_header__init_id(&bpf_event->event_id.header,
8522                                    &sample, event);
8523         ret = perf_output_begin(&handle, event,
8524                                 bpf_event->event_id.header.size);
8525         if (ret)
8526                 return;
8527
8528         perf_output_put(&handle, bpf_event->event_id);
8529         perf_event__output_id_sample(event, &handle, &sample);
8530
8531         perf_output_end(&handle);
8532 }
8533
8534 static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
8535                                          enum perf_bpf_event_type type)
8536 {
8537         bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
8538         int i;
8539
8540         if (prog->aux->func_cnt == 0) {
8541                 perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
8542                                    (u64)(unsigned long)prog->bpf_func,
8543                                    prog->jited_len, unregister,
8544                                    prog->aux->ksym.name);
8545         } else {
8546                 for (i = 0; i < prog->aux->func_cnt; i++) {
8547                         struct bpf_prog *subprog = prog->aux->func[i];
8548
8549                         perf_event_ksymbol(
8550                                 PERF_RECORD_KSYMBOL_TYPE_BPF,
8551                                 (u64)(unsigned long)subprog->bpf_func,
8552                                 subprog->jited_len, unregister,
8553                                 prog->aux->ksym.name);
8554                 }
8555         }
8556 }
8557
8558 void perf_event_bpf_event(struct bpf_prog *prog,
8559                           enum perf_bpf_event_type type,
8560                           u16 flags)
8561 {
8562         struct perf_bpf_event bpf_event;
8563
8564         if (type <= PERF_BPF_EVENT_UNKNOWN ||
8565             type >= PERF_BPF_EVENT_MAX)
8566                 return;
8567
8568         switch (type) {
8569         case PERF_BPF_EVENT_PROG_LOAD:
8570         case PERF_BPF_EVENT_PROG_UNLOAD:
8571                 if (atomic_read(&nr_ksymbol_events))
8572                         perf_event_bpf_emit_ksymbols(prog, type);
8573                 break;
8574         default:
8575                 break;
8576         }
8577
8578         if (!atomic_read(&nr_bpf_events))
8579                 return;
8580
8581         bpf_event = (struct perf_bpf_event){
8582                 .prog = prog,
8583                 .event_id = {
8584                         .header = {
8585                                 .type = PERF_RECORD_BPF_EVENT,
8586                                 .size = sizeof(bpf_event.event_id),
8587                         },
8588                         .type = type,
8589                         .flags = flags,
8590                         .id = prog->aux->id,
8591                 },
8592         };
8593
8594         BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64));
8595
8596         memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);
8597         perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
8598 }
8599
8600 void perf_event_itrace_started(struct perf_event *event)
8601 {
8602         event->attach_state |= PERF_ATTACH_ITRACE;
8603 }
8604
8605 static void perf_log_itrace_start(struct perf_event *event)
8606 {
8607         struct perf_output_handle handle;
8608         struct perf_sample_data sample;
8609         struct perf_aux_event {
8610                 struct perf_event_header        header;
8611                 u32                             pid;
8612                 u32                             tid;
8613         } rec;
8614         int ret;
8615
8616         if (event->parent)
8617                 event = event->parent;
8618
8619         if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
8620             event->attach_state & PERF_ATTACH_ITRACE)
8621                 return;
8622
8623         rec.header.type = PERF_RECORD_ITRACE_START;
8624         rec.header.misc = 0;
8625         rec.header.size = sizeof(rec);
8626         rec.pid = perf_event_pid(event, current);
8627         rec.tid = perf_event_tid(event, current);
8628
8629         perf_event_header__init_id(&rec.header, &sample, event);
8630         ret = perf_output_begin(&handle, event, rec.header.size);
8631
8632         if (ret)
8633                 return;
8634
8635         perf_output_put(&handle, rec);
8636         perf_event__output_id_sample(event, &handle, &sample);
8637
8638         perf_output_end(&handle);
8639 }
8640
8641 static int
8642 __perf_event_account_interrupt(struct perf_event *event, int throttle)
8643 {
8644         struct hw_perf_event *hwc = &event->hw;
8645         int ret = 0;
8646         u64 seq;
8647
8648         seq = __this_cpu_read(perf_throttled_seq);
8649         if (seq != hwc->interrupts_seq) {
8650                 hwc->interrupts_seq = seq;
8651                 hwc->interrupts = 1;
8652         } else {
8653                 hwc->interrupts++;
8654                 if (unlikely(throttle
8655                              && hwc->interrupts >= max_samples_per_tick)) {
8656                         __this_cpu_inc(perf_throttled_count);
8657                         tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
8658                         hwc->interrupts = MAX_INTERRUPTS;
8659                         perf_log_throttle(event, 0);
8660                         ret = 1;
8661                 }
8662         }
8663
8664         if (event->attr.freq) {
8665                 u64 now = perf_clock();
8666                 s64 delta = now - hwc->freq_time_stamp;
8667
8668                 hwc->freq_time_stamp = now;
8669
8670                 if (delta > 0 && delta < 2*TICK_NSEC)
8671                         perf_adjust_period(event, delta, hwc->last_period, true);
8672         }
8673
8674         return ret;
8675 }
8676
8677 int perf_event_account_interrupt(struct perf_event *event)
8678 {
8679         return __perf_event_account_interrupt(event, 1);
8680 }
8681
8682 /*
8683  * Generic event overflow handling, sampling.
8684  */
8685
8686 static int __perf_event_overflow(struct perf_event *event,
8687                                    int throttle, struct perf_sample_data *data,
8688                                    struct pt_regs *regs)
8689 {
8690         int events = atomic_read(&event->event_limit);
8691         int ret = 0;
8692
8693         /*
8694          * Non-sampling counters might still use the PMI to fold short
8695          * hardware counters, ignore those.
8696          */
8697         if (unlikely(!is_sampling_event(event)))
8698                 return 0;
8699
8700         ret = __perf_event_account_interrupt(event, throttle);
8701
8702         /*
8703          * XXX event_limit might not quite work as expected on inherited
8704          * events
8705          */
8706
8707         event->pending_kill = POLL_IN;
8708         if (events && atomic_dec_and_test(&event->event_limit)) {
8709                 ret = 1;
8710                 event->pending_kill = POLL_HUP;
8711
8712                 perf_event_disable_inatomic(event);
8713         }
8714
8715         READ_ONCE(event->overflow_handler)(event, data, regs);
8716
8717         if (*perf_event_fasync(event) && event->pending_kill) {
8718                 event->pending_wakeup = 1;
8719                 irq_work_queue(&event->pending);
8720         }
8721
8722         return ret;
8723 }
8724
8725 int perf_event_overflow(struct perf_event *event,
8726                           struct perf_sample_data *data,
8727                           struct pt_regs *regs)
8728 {
8729         return __perf_event_overflow(event, 1, data, regs);
8730 }
8731
8732 /*
8733  * Generic software event infrastructure
8734  */
8735
8736 struct swevent_htable {
8737         struct swevent_hlist            *swevent_hlist;
8738         struct mutex                    hlist_mutex;
8739         int                             hlist_refcount;
8740
8741         /* Recursion avoidance in each contexts */
8742         int                             recursion[PERF_NR_CONTEXTS];
8743 };
8744
8745 static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
8746
8747 /*
8748  * We directly increment event->count and keep a second value in
8749  * event->hw.period_left to count intervals. This period event
8750  * is kept in the range [-sample_period, 0] so that we can use the
8751  * sign as trigger.
8752  */
8753
8754 u64 perf_swevent_set_period(struct perf_event *event)
8755 {
8756         struct hw_perf_event *hwc = &event->hw;
8757         u64 period = hwc->last_period;
8758         u64 nr, offset;
8759         s64 old, val;
8760
8761         hwc->last_period = hwc->sample_period;
8762
8763 again:
8764         old = val = local64_read(&hwc->period_left);
8765         if (val < 0)
8766                 return 0;
8767
8768         nr = div64_u64(period + val, period);
8769         offset = nr * period;
8770         val -= offset;
8771         if (local64_cmpxchg(&hwc->period_left, old, val) != old)
8772                 goto again;
8773
8774         return nr;
8775 }
8776
8777 static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
8778                                     struct perf_sample_data *data,
8779                                     struct pt_regs *regs)
8780 {
8781         struct hw_perf_event *hwc = &event->hw;
8782         int throttle = 0;
8783
8784         if (!overflow)
8785                 overflow = perf_swevent_set_period(event);
8786
8787         if (hwc->interrupts == MAX_INTERRUPTS)
8788                 return;
8789
8790         for (; overflow; overflow--) {
8791                 if (__perf_event_overflow(event, throttle,
8792                                             data, regs)) {
8793                         /*
8794                          * We inhibit the overflow from happening when
8795                          * hwc->interrupts == MAX_INTERRUPTS.
8796                          */
8797                         break;
8798                 }
8799                 throttle = 1;
8800         }
8801 }
8802
8803 static void perf_swevent_event(struct perf_event *event, u64 nr,
8804                                struct perf_sample_data *data,
8805                                struct pt_regs *regs)
8806 {
8807         struct hw_perf_event *hwc = &event->hw;
8808
8809         local64_add(nr, &event->count);
8810
8811         if (!regs)
8812                 return;
8813
8814         if (!is_sampling_event(event))
8815                 return;
8816
8817         if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
8818                 data->period = nr;
8819                 return perf_swevent_overflow(event, 1, data, regs);
8820         } else
8821                 data->period = event->hw.last_period;
8822
8823         if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
8824                 return perf_swevent_overflow(event, 1, data, regs);
8825
8826         if (local64_add_negative(nr, &hwc->period_left))
8827                 return;
8828
8829         perf_swevent_overflow(event, 0, data, regs);
8830 }
8831
8832 static int perf_exclude_event(struct perf_event *event,
8833                               struct pt_regs *regs)
8834 {
8835         if (event->hw.state & PERF_HES_STOPPED)
8836                 return 1;
8837
8838         if (regs) {
8839                 if (event->attr.exclude_user && user_mode(regs))
8840                         return 1;
8841
8842                 if (event->attr.exclude_kernel && !user_mode(regs))
8843                         return 1;
8844         }
8845
8846         return 0;
8847 }
8848
8849 static int perf_swevent_match(struct perf_event *event,
8850                                 enum perf_type_id type,
8851                                 u32 event_id,
8852                                 struct perf_sample_data *data,
8853                                 struct pt_regs *regs)
8854 {
8855         if (event->attr.type != type)
8856                 return 0;
8857
8858         if (event->attr.config != event_id)
8859                 return 0;
8860
8861         if (perf_exclude_event(event, regs))
8862                 return 0;
8863
8864         return 1;
8865 }
8866
8867 static inline u64 swevent_hash(u64 type, u32 event_id)
8868 {
8869         u64 val = event_id | (type << 32);
8870
8871         return hash_64(val, SWEVENT_HLIST_BITS);
8872 }
8873
8874 static inline struct hlist_head *
8875 __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
8876 {
8877         u64 hash = swevent_hash(type, event_id);
8878
8879         return &hlist->heads[hash];
8880 }
8881
8882 /* For the read side: events when they trigger */
8883 static inline struct hlist_head *
8884 find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
8885 {
8886         struct swevent_hlist *hlist;
8887
8888         hlist = rcu_dereference(swhash->swevent_hlist);
8889         if (!hlist)
8890                 return NULL;
8891
8892         return __find_swevent_head(hlist, type, event_id);
8893 }
8894
8895 /* For the event head insertion and removal in the hlist */
8896 static inline struct hlist_head *
8897 find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
8898 {
8899         struct swevent_hlist *hlist;
8900         u32 event_id = event->attr.config;
8901         u64 type = event->attr.type;
8902
8903         /*
8904          * Event scheduling is always serialized against hlist allocation
8905          * and release. Which makes the protected version suitable here.
8906          * The context lock guarantees that.
8907          */
8908         hlist = rcu_dereference_protected(swhash->swevent_hlist,
8909                                           lockdep_is_held(&event->ctx->lock));
8910         if (!hlist)
8911                 return NULL;
8912
8913         return __find_swevent_head(hlist, type, event_id);
8914 }
8915
8916 static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
8917                                     u64 nr,
8918                                     struct perf_sample_data *data,
8919                                     struct pt_regs *regs)
8920 {
8921         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
8922         struct perf_event *event;
8923         struct hlist_head *head;
8924
8925         rcu_read_lock();
8926         head = find_swevent_head_rcu(swhash, type, event_id);
8927         if (!head)
8928                 goto end;
8929
8930         hlist_for_each_entry_rcu(event, head, hlist_entry) {
8931                 if (perf_swevent_match(event, type, event_id, data, regs))
8932                         perf_swevent_event(event, nr, data, regs);
8933         }
8934 end:
8935         rcu_read_unlock();
8936 }
8937
8938 DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
8939
8940 int perf_swevent_get_recursion_context(void)
8941 {
8942         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
8943
8944         return get_recursion_context(swhash->recursion);
8945 }
8946 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
8947
8948 void perf_swevent_put_recursion_context(int rctx)
8949 {
8950         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
8951
8952         put_recursion_context(swhash->recursion, rctx);
8953 }
8954
8955 void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
8956 {
8957         struct perf_sample_data data;
8958
8959         if (WARN_ON_ONCE(!regs))
8960                 return;
8961
8962         perf_sample_data_init(&data, addr, 0);
8963         do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
8964 }
8965
8966 void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
8967 {
8968         int rctx;
8969
8970         preempt_disable_notrace();
8971         rctx = perf_swevent_get_recursion_context();
8972         if (unlikely(rctx < 0))
8973                 goto fail;
8974
8975         ___perf_sw_event(event_id, nr, regs, addr);
8976
8977         perf_swevent_put_recursion_context(rctx);
8978 fail:
8979         preempt_enable_notrace();
8980 }
8981
8982 static void perf_swevent_read(struct perf_event *event)
8983 {
8984 }
8985
8986 static int perf_swevent_add(struct perf_event *event, int flags)
8987 {
8988         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
8989         struct hw_perf_event *hwc = &event->hw;
8990         struct hlist_head *head;
8991
8992         if (is_sampling_event(event)) {
8993                 hwc->last_period = hwc->sample_period;
8994                 perf_swevent_set_period(event);
8995         }
8996
8997         hwc->state = !(flags & PERF_EF_START);
8998
8999         head = find_swevent_head(swhash, event);
9000         if (WARN_ON_ONCE(!head))
9001                 return -EINVAL;
9002
9003         hlist_add_head_rcu(&event->hlist_entry, head);
9004         perf_event_update_userpage(event);
9005
9006         return 0;
9007 }
9008
9009 static void perf_swevent_del(struct perf_event *event, int flags)
9010 {
9011         hlist_del_rcu(&event->hlist_entry);
9012 }
9013
9014 static void perf_swevent_start(struct perf_event *event, int flags)
9015 {
9016         event->hw.state = 0;
9017 }
9018
9019 static void perf_swevent_stop(struct perf_event *event, int flags)
9020 {
9021         event->hw.state = PERF_HES_STOPPED;
9022 }
9023
9024 /* Deref the hlist from the update side */
9025 static inline struct swevent_hlist *
9026 swevent_hlist_deref(struct swevent_htable *swhash)
9027 {
9028         return rcu_dereference_protected(swhash->swevent_hlist,
9029                                          lockdep_is_held(&swhash->hlist_mutex));
9030 }
9031
9032 static void swevent_hlist_release(struct swevent_htable *swhash)
9033 {
9034         struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
9035
9036         if (!hlist)
9037                 return;
9038
9039         RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
9040         kfree_rcu(hlist, rcu_head);
9041 }
9042
9043 static void swevent_hlist_put_cpu(int cpu)
9044 {
9045         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
9046
9047         mutex_lock(&swhash->hlist_mutex);
9048
9049         if (!--swhash->hlist_refcount)
9050                 swevent_hlist_release(swhash);
9051
9052         mutex_unlock(&swhash->hlist_mutex);
9053 }
9054
9055 static void swevent_hlist_put(void)
9056 {
9057         int cpu;
9058
9059         for_each_possible_cpu(cpu)
9060                 swevent_hlist_put_cpu(cpu);
9061 }
9062
9063 static int swevent_hlist_get_cpu(int cpu)
9064 {
9065         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
9066         int err = 0;
9067
9068         mutex_lock(&swhash->hlist_mutex);
9069         if (!swevent_hlist_deref(swhash) &&
9070             cpumask_test_cpu(cpu, perf_online_mask)) {
9071                 struct swevent_hlist *hlist;
9072
9073                 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
9074                 if (!hlist) {
9075                         err = -ENOMEM;
9076                         goto exit;
9077                 }
9078                 rcu_assign_pointer(swhash->swevent_hlist, hlist);
9079         }
9080         swhash->hlist_refcount++;
9081 exit:
9082         mutex_unlock(&swhash->hlist_mutex);
9083
9084         return err;
9085 }
9086
9087 static int swevent_hlist_get(void)
9088 {
9089         int err, cpu, failed_cpu;
9090
9091         mutex_lock(&pmus_lock);
9092         for_each_possible_cpu(cpu) {
9093                 err = swevent_hlist_get_cpu(cpu);
9094                 if (err) {
9095                         failed_cpu = cpu;
9096                         goto fail;
9097                 }
9098         }
9099         mutex_unlock(&pmus_lock);
9100         return 0;
9101 fail:
9102         for_each_possible_cpu(cpu) {
9103                 if (cpu == failed_cpu)
9104                         break;
9105                 swevent_hlist_put_cpu(cpu);
9106         }
9107         mutex_unlock(&pmus_lock);
9108         return err;
9109 }
9110
9111 struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
9112
9113 static void sw_perf_event_destroy(struct perf_event *event)
9114 {
9115         u64 event_id = event->attr.config;
9116
9117         WARN_ON(event->parent);
9118
9119         static_key_slow_dec(&perf_swevent_enabled[event_id]);
9120         swevent_hlist_put();
9121 }
9122
9123 static int perf_swevent_init(struct perf_event *event)
9124 {
9125         u64 event_id = event->attr.config;
9126
9127         if (event->attr.type != PERF_TYPE_SOFTWARE)
9128                 return -ENOENT;
9129
9130         /*
9131          * no branch sampling for software events
9132          */
9133         if (has_branch_stack(event))
9134                 return -EOPNOTSUPP;
9135
9136         switch (event_id) {
9137         case PERF_COUNT_SW_CPU_CLOCK:
9138         case PERF_COUNT_SW_TASK_CLOCK:
9139                 return -ENOENT;
9140
9141         default:
9142                 break;
9143         }
9144
9145         if (event_id >= PERF_COUNT_SW_MAX)
9146                 return -ENOENT;
9147
9148         if (!event->parent) {
9149                 int err;
9150
9151                 err = swevent_hlist_get();
9152                 if (err)
9153                         return err;
9154
9155                 static_key_slow_inc(&perf_swevent_enabled[event_id]);
9156                 event->destroy = sw_perf_event_destroy;
9157         }
9158
9159         return 0;
9160 }
9161
9162 static struct pmu perf_swevent = {
9163         .task_ctx_nr    = perf_sw_context,
9164
9165         .capabilities   = PERF_PMU_CAP_NO_NMI,
9166
9167         .event_init     = perf_swevent_init,
9168         .add            = perf_swevent_add,
9169         .del            = perf_swevent_del,
9170         .start          = perf_swevent_start,
9171         .stop           = perf_swevent_stop,
9172         .read           = perf_swevent_read,
9173 };
9174
9175 #ifdef CONFIG_EVENT_TRACING
9176
9177 static int perf_tp_filter_match(struct perf_event *event,
9178                                 struct perf_sample_data *data)
9179 {
9180         void *record = data->raw->frag.data;
9181
9182         /* only top level events have filters set */
9183         if (event->parent)
9184                 event = event->parent;
9185
9186         if (likely(!event->filter) || filter_match_preds(event->filter, record))
9187                 return 1;
9188         return 0;
9189 }
9190
9191 static int perf_tp_event_match(struct perf_event *event,
9192                                 struct perf_sample_data *data,
9193                                 struct pt_regs *regs)
9194 {
9195         if (event->hw.state & PERF_HES_STOPPED)
9196                 return 0;
9197         /*
9198          * If exclude_kernel, only trace user-space tracepoints (uprobes)
9199          */
9200         if (event->attr.exclude_kernel && !user_mode(regs))
9201                 return 0;
9202
9203         if (!perf_tp_filter_match(event, data))
9204                 return 0;
9205
9206         return 1;
9207 }
9208
9209 void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
9210                                struct trace_event_call *call, u64 count,
9211                                struct pt_regs *regs, struct hlist_head *head,
9212                                struct task_struct *task)
9213 {
9214         if (bpf_prog_array_valid(call)) {
9215                 *(struct pt_regs **)raw_data = regs;
9216                 if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) {
9217                         perf_swevent_put_recursion_context(rctx);
9218                         return;
9219                 }
9220         }
9221         perf_tp_event(call->event.type, count, raw_data, size, regs, head,
9222                       rctx, task);
9223 }
9224 EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
9225
9226 void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
9227                    struct pt_regs *regs, struct hlist_head *head, int rctx,
9228                    struct task_struct *task)
9229 {
9230         struct perf_sample_data data;
9231         struct perf_event *event;
9232
9233         struct perf_raw_record raw = {
9234                 .frag = {
9235                         .size = entry_size,
9236                         .data = record,
9237                 },
9238         };
9239
9240         perf_sample_data_init(&data, 0, 0);
9241         data.raw = &raw;
9242
9243         perf_trace_buf_update(record, event_type);
9244
9245         hlist_for_each_entry_rcu(event, head, hlist_entry) {
9246                 if (perf_tp_event_match(event, &data, regs))
9247                         perf_swevent_event(event, count, &data, regs);
9248         }
9249
9250         /*
9251          * If we got specified a target task, also iterate its context and
9252          * deliver this event there too.
9253          */
9254         if (task && task != current) {
9255                 struct perf_event_context *ctx;
9256                 struct trace_entry *entry = record;
9257
9258                 rcu_read_lock();
9259                 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
9260                 if (!ctx)
9261                         goto unlock;
9262
9263                 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
9264                         if (event->cpu != smp_processor_id())
9265                                 continue;
9266                         if (event->attr.type != PERF_TYPE_TRACEPOINT)
9267                                 continue;
9268                         if (event->attr.config != entry->type)
9269                                 continue;
9270                         if (perf_tp_event_match(event, &data, regs))
9271                                 perf_swevent_event(event, count, &data, regs);
9272                 }
9273 unlock:
9274                 rcu_read_unlock();
9275         }
9276
9277         perf_swevent_put_recursion_context(rctx);
9278 }
9279 EXPORT_SYMBOL_GPL(perf_tp_event);
9280
9281 static void tp_perf_event_destroy(struct perf_event *event)
9282 {
9283         perf_trace_destroy(event);
9284 }
9285
9286 static int perf_tp_event_init(struct perf_event *event)
9287 {
9288         int err;
9289
9290         if (event->attr.type != PERF_TYPE_TRACEPOINT)
9291                 return -ENOENT;
9292
9293         /*
9294          * no branch sampling for tracepoint events
9295          */
9296         if (has_branch_stack(event))
9297                 return -EOPNOTSUPP;
9298
9299         err = perf_trace_init(event);
9300         if (err)
9301                 return err;
9302
9303         event->destroy = tp_perf_event_destroy;
9304
9305         return 0;
9306 }
9307
9308 static struct pmu perf_tracepoint = {
9309         .task_ctx_nr    = perf_sw_context,
9310
9311         .event_init     = perf_tp_event_init,
9312         .add            = perf_trace_add,
9313         .del            = perf_trace_del,
9314         .start          = perf_swevent_start,
9315         .stop           = perf_swevent_stop,
9316         .read           = perf_swevent_read,
9317 };
9318
9319 #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
9320 /*
9321  * Flags in config, used by dynamic PMU kprobe and uprobe
9322  * The flags should match following PMU_FORMAT_ATTR().
9323  *
9324  * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
9325  *                               if not set, create kprobe/uprobe
9326  *
9327  * The following values specify a reference counter (or semaphore in the
9328  * terminology of tools like dtrace, systemtap, etc.) Userspace Statically
9329  * Defined Tracepoints (USDT). Currently, we use 40 bit for the offset.
9330  *
9331  * PERF_UPROBE_REF_CTR_OFFSET_BITS      # of bits in config as th offset
9332  * PERF_UPROBE_REF_CTR_OFFSET_SHIFT     # of bits to shift left
9333  */
9334 enum perf_probe_config {
9335         PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0,  /* [k,u]retprobe */
9336         PERF_UPROBE_REF_CTR_OFFSET_BITS = 32,
9337         PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS,
9338 };
9339
9340 PMU_FORMAT_ATTR(retprobe, "config:0");
9341 #endif
9342
9343 #ifdef CONFIG_KPROBE_EVENTS
9344 static struct attribute *kprobe_attrs[] = {
9345         &format_attr_retprobe.attr,
9346         NULL,
9347 };
9348
9349 static struct attribute_group kprobe_format_group = {
9350         .name = "format",
9351         .attrs = kprobe_attrs,
9352 };
9353
9354 static const struct attribute_group *kprobe_attr_groups[] = {
9355         &kprobe_format_group,
9356         NULL,
9357 };
9358
9359 static int perf_kprobe_event_init(struct perf_event *event);
9360 static struct pmu perf_kprobe = {
9361         .task_ctx_nr    = perf_sw_context,
9362         .event_init     = perf_kprobe_event_init,
9363         .add            = perf_trace_add,
9364         .del            = perf_trace_del,
9365         .start          = perf_swevent_start,
9366         .stop           = perf_swevent_stop,
9367         .read           = perf_swevent_read,
9368         .attr_groups    = kprobe_attr_groups,
9369 };
9370
9371 static int perf_kprobe_event_init(struct perf_event *event)
9372 {
9373         int err;
9374         bool is_retprobe;
9375
9376         if (event->attr.type != perf_kprobe.type)
9377                 return -ENOENT;
9378
9379         if (!capable(CAP_SYS_ADMIN))
9380                 return -EACCES;
9381
9382         /*
9383          * no branch sampling for probe events
9384          */
9385         if (has_branch_stack(event))
9386                 return -EOPNOTSUPP;
9387
9388         is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
9389         err = perf_kprobe_init(event, is_retprobe);
9390         if (err)
9391                 return err;
9392
9393         event->destroy = perf_kprobe_destroy;
9394
9395         return 0;
9396 }
9397 #endif /* CONFIG_KPROBE_EVENTS */
9398
9399 #ifdef CONFIG_UPROBE_EVENTS
9400 PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63");
9401
9402 static struct attribute *uprobe_attrs[] = {
9403         &format_attr_retprobe.attr,
9404         &format_attr_ref_ctr_offset.attr,
9405         NULL,
9406 };
9407
9408 static struct attribute_group uprobe_format_group = {
9409         .name = "format",
9410         .attrs = uprobe_attrs,
9411 };
9412
9413 static const struct attribute_group *uprobe_attr_groups[] = {
9414         &uprobe_format_group,
9415         NULL,
9416 };
9417
9418 static int perf_uprobe_event_init(struct perf_event *event);
9419 static struct pmu perf_uprobe = {
9420         .task_ctx_nr    = perf_sw_context,
9421         .event_init     = perf_uprobe_event_init,
9422         .add            = perf_trace_add,
9423         .del            = perf_trace_del,
9424         .start          = perf_swevent_start,
9425         .stop           = perf_swevent_stop,
9426         .read           = perf_swevent_read,
9427         .attr_groups    = uprobe_attr_groups,
9428 };
9429
9430 static int perf_uprobe_event_init(struct perf_event *event)
9431 {
9432         int err;
9433         unsigned long ref_ctr_offset;
9434         bool is_retprobe;
9435
9436         if (event->attr.type != perf_uprobe.type)
9437                 return -ENOENT;
9438
9439         if (!capable(CAP_SYS_ADMIN))
9440                 return -EACCES;
9441
9442         /*
9443          * no branch sampling for probe events
9444          */
9445         if (has_branch_stack(event))
9446                 return -EOPNOTSUPP;
9447
9448         is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
9449         ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT;
9450         err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe);
9451         if (err)
9452                 return err;
9453
9454         event->destroy = perf_uprobe_destroy;
9455
9456         return 0;
9457 }
9458 #endif /* CONFIG_UPROBE_EVENTS */
9459
9460 static inline void perf_tp_register(void)
9461 {
9462         perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
9463 #ifdef CONFIG_KPROBE_EVENTS
9464         perf_pmu_register(&perf_kprobe, "kprobe", -1);
9465 #endif
9466 #ifdef CONFIG_UPROBE_EVENTS
9467         perf_pmu_register(&perf_uprobe, "uprobe", -1);
9468 #endif
9469 }
9470
9471 static void perf_event_free_filter(struct perf_event *event)
9472 {
9473         ftrace_profile_free_filter(event);
9474 }
9475
9476 #ifdef CONFIG_BPF_SYSCALL
9477 static void bpf_overflow_handler(struct perf_event *event,
9478                                  struct perf_sample_data *data,
9479                                  struct pt_regs *regs)
9480 {
9481         struct bpf_perf_event_data_kern ctx = {
9482                 .data = data,
9483                 .event = event,
9484         };
9485         int ret = 0;
9486
9487         ctx.regs = perf_arch_bpf_user_pt_regs(regs);
9488         if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
9489                 goto out;
9490         rcu_read_lock();
9491         ret = BPF_PROG_RUN(event->prog, &ctx);
9492         rcu_read_unlock();
9493 out:
9494         __this_cpu_dec(bpf_prog_active);
9495         if (!ret)
9496                 return;
9497
9498         event->orig_overflow_handler(event, data, regs);
9499 }
9500
9501 static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
9502 {
9503         struct bpf_prog *prog;
9504
9505         if (event->overflow_handler_context)
9506                 /* hw breakpoint or kernel counter */
9507                 return -EINVAL;
9508
9509         if (event->prog)
9510                 return -EEXIST;
9511
9512         prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
9513         if (IS_ERR(prog))
9514                 return PTR_ERR(prog);
9515
9516         event->prog = prog;
9517         event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
9518         WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
9519         return 0;
9520 }
9521
9522 static void perf_event_free_bpf_handler(struct perf_event *event)
9523 {
9524         struct bpf_prog *prog = event->prog;
9525
9526         if (!prog)
9527                 return;
9528
9529         WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
9530         event->prog = NULL;
9531         bpf_prog_put(prog);
9532 }
9533 #else
9534 static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
9535 {
9536         return -EOPNOTSUPP;
9537 }
9538 static void perf_event_free_bpf_handler(struct perf_event *event)
9539 {
9540 }
9541 #endif
9542
9543 /*
9544  * returns true if the event is a tracepoint, or a kprobe/upprobe created
9545  * with perf_event_open()
9546  */
9547 static inline bool perf_event_is_tracing(struct perf_event *event)
9548 {
9549         if (event->pmu == &perf_tracepoint)
9550                 return true;
9551 #ifdef CONFIG_KPROBE_EVENTS
9552         if (event->pmu == &perf_kprobe)
9553                 return true;
9554 #endif
9555 #ifdef CONFIG_UPROBE_EVENTS
9556         if (event->pmu == &perf_uprobe)
9557                 return true;
9558 #endif
9559         return false;
9560 }
9561
9562 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
9563 {
9564         bool is_kprobe, is_tracepoint, is_syscall_tp;
9565         struct bpf_prog *prog;
9566         int ret;
9567
9568         if (!perf_event_is_tracing(event))
9569                 return perf_event_set_bpf_handler(event, prog_fd);
9570
9571         is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
9572         is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
9573         is_syscall_tp = is_syscall_trace_event(event->tp_event);
9574         if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
9575                 /* bpf programs can only be attached to u/kprobe or tracepoint */
9576                 return -EINVAL;
9577
9578         prog = bpf_prog_get(prog_fd);
9579         if (IS_ERR(prog))
9580                 return PTR_ERR(prog);
9581
9582         if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
9583             (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
9584             (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
9585                 /* valid fd, but invalid bpf program type */
9586                 bpf_prog_put(prog);
9587                 return -EINVAL;
9588         }
9589
9590         /* Kprobe override only works for kprobes, not uprobes. */
9591         if (prog->kprobe_override &&
9592             !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) {
9593                 bpf_prog_put(prog);
9594                 return -EINVAL;
9595         }
9596
9597         if (is_tracepoint || is_syscall_tp) {
9598                 int off = trace_event_get_offsets(event->tp_event);
9599
9600                 if (prog->aux->max_ctx_offset > off) {
9601                         bpf_prog_put(prog);
9602                         return -EACCES;
9603                 }
9604         }
9605
9606         ret = perf_event_attach_bpf_prog(event, prog);
9607         if (ret)
9608                 bpf_prog_put(prog);
9609         return ret;
9610 }
9611
9612 static void perf_event_free_bpf_prog(struct perf_event *event)
9613 {
9614         if (!perf_event_is_tracing(event)) {
9615                 perf_event_free_bpf_handler(event);
9616                 return;
9617         }
9618         perf_event_detach_bpf_prog(event);
9619 }
9620
9621 #else
9622
9623 static inline void perf_tp_register(void)
9624 {
9625 }
9626
9627 static void perf_event_free_filter(struct perf_event *event)
9628 {
9629 }
9630
9631 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
9632 {
9633         return -ENOENT;
9634 }
9635
9636 static void perf_event_free_bpf_prog(struct perf_event *event)
9637 {
9638 }
9639 #endif /* CONFIG_EVENT_TRACING */
9640
9641 #ifdef CONFIG_HAVE_HW_BREAKPOINT
9642 void perf_bp_event(struct perf_event *bp, void *data)
9643 {
9644         struct perf_sample_data sample;
9645         struct pt_regs *regs = data;
9646
9647         perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
9648
9649         if (!bp->hw.state && !perf_exclude_event(bp, regs))
9650                 perf_swevent_event(bp, 1, &sample, regs);
9651 }
9652 #endif
9653
9654 /*
9655  * Allocate a new address filter
9656  */
9657 static struct perf_addr_filter *
9658 perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
9659 {
9660         int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu);
9661         struct perf_addr_filter *filter;
9662
9663         filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
9664         if (!filter)
9665                 return NULL;
9666
9667         INIT_LIST_HEAD(&filter->entry);
9668         list_add_tail(&filter->entry, filters);
9669
9670         return filter;
9671 }
9672
9673 static void free_filters_list(struct list_head *filters)
9674 {
9675         struct perf_addr_filter *filter, *iter;
9676
9677         list_for_each_entry_safe(filter, iter, filters, entry) {
9678                 path_put(&filter->path);
9679                 list_del(&filter->entry);
9680                 kfree(filter);
9681         }
9682 }
9683
9684 /*
9685  * Free existing address filters and optionally install new ones
9686  */
9687 static void perf_addr_filters_splice(struct perf_event *event,
9688                                      struct list_head *head)
9689 {
9690         unsigned long flags;
9691         LIST_HEAD(list);
9692
9693         if (!has_addr_filter(event))
9694                 return;
9695
9696         /* don't bother with children, they don't have their own filters */
9697         if (event->parent)
9698                 return;
9699
9700         raw_spin_lock_irqsave(&event->addr_filters.lock, flags);
9701
9702         list_splice_init(&event->addr_filters.list, &list);
9703         if (head)
9704                 list_splice(head, &event->addr_filters.list);
9705
9706         raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);
9707
9708         free_filters_list(&list);
9709 }
9710
9711 /*
9712  * Scan through mm's vmas and see if one of them matches the
9713  * @filter; if so, adjust filter's address range.
9714  * Called with mm::mmap_sem down for reading.
9715  */
9716 static void perf_addr_filter_apply(struct perf_addr_filter *filter,
9717                                    struct mm_struct *mm,
9718                                    struct perf_addr_filter_range *fr)
9719 {
9720         struct vm_area_struct *vma;
9721
9722         for (vma = mm->mmap; vma; vma = vma->vm_next) {
9723                 if (!vma->vm_file)
9724                         continue;
9725
9726                 if (perf_addr_filter_vma_adjust(filter, vma, fr))
9727                         return;
9728         }
9729 }
9730
9731 /*
9732  * Update event's address range filters based on the
9733  * task's existing mappings, if any.
9734  */
9735 static void perf_event_addr_filters_apply(struct perf_event *event)
9736 {
9737         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
9738         struct task_struct *task = READ_ONCE(event->ctx->task);
9739         struct perf_addr_filter *filter;
9740         struct mm_struct *mm = NULL;
9741         unsigned int count = 0;
9742         unsigned long flags;
9743
9744         /*
9745          * We may observe TASK_TOMBSTONE, which means that the event tear-down
9746          * will stop on the parent's child_mutex that our caller is also holding
9747          */
9748         if (task == TASK_TOMBSTONE)
9749                 return;
9750
9751         if (ifh->nr_file_filters) {
9752                 mm = get_task_mm(event->ctx->task);
9753                 if (!mm)
9754                         goto restart;
9755
9756                 down_read(&mm->mmap_sem);
9757         }
9758
9759         raw_spin_lock_irqsave(&ifh->lock, flags);
9760         list_for_each_entry(filter, &ifh->list, entry) {
9761                 if (filter->path.dentry) {
9762                         /*
9763                          * Adjust base offset if the filter is associated to a
9764                          * binary that needs to be mapped:
9765                          */
9766                         event->addr_filter_ranges[count].start = 0;
9767                         event->addr_filter_ranges[count].size = 0;
9768
9769                         perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]);
9770                 } else {
9771                         event->addr_filter_ranges[count].start = filter->offset;
9772                         event->addr_filter_ranges[count].size  = filter->size;
9773                 }
9774
9775                 count++;
9776         }
9777
9778         event->addr_filters_gen++;
9779         raw_spin_unlock_irqrestore(&ifh->lock, flags);
9780
9781         if (ifh->nr_file_filters) {
9782                 up_read(&mm->mmap_sem);
9783
9784                 mmput(mm);
9785         }
9786
9787 restart:
9788         perf_event_stop(event, 1);
9789 }
9790
9791 /*
9792  * Address range filtering: limiting the data to certain
9793  * instruction address ranges. Filters are ioctl()ed to us from
9794  * userspace as ascii strings.
9795  *
9796  * Filter string format:
9797  *
9798  * ACTION RANGE_SPEC
9799  * where ACTION is one of the
9800  *  * "filter": limit the trace to this region
9801  *  * "start": start tracing from this address
9802  *  * "stop": stop tracing at this address/region;
9803  * RANGE_SPEC is
9804  *  * for kernel addresses: <start address>[/<size>]
9805  *  * for object files:     <start address>[/<size>]@</path/to/object/file>
9806  *
9807  * if <size> is not specified or is zero, the range is treated as a single
9808  * address; not valid for ACTION=="filter".
9809  */
9810 enum {
9811         IF_ACT_NONE = -1,
9812         IF_ACT_FILTER,
9813         IF_ACT_START,
9814         IF_ACT_STOP,
9815         IF_SRC_FILE,
9816         IF_SRC_KERNEL,
9817         IF_SRC_FILEADDR,
9818         IF_SRC_KERNELADDR,
9819 };
9820
9821 enum {
9822         IF_STATE_ACTION = 0,
9823         IF_STATE_SOURCE,
9824         IF_STATE_END,
9825 };
9826
9827 static const match_table_t if_tokens = {
9828         { IF_ACT_FILTER,        "filter" },
9829         { IF_ACT_START,         "start" },
9830         { IF_ACT_STOP,          "stop" },
9831         { IF_SRC_FILE,          "%u/%u@%s" },
9832         { IF_SRC_KERNEL,        "%u/%u" },
9833         { IF_SRC_FILEADDR,      "%u@%s" },
9834         { IF_SRC_KERNELADDR,    "%u" },
9835         { IF_ACT_NONE,          NULL },
9836 };
9837
9838 /*
9839  * Address filter string parser
9840  */
9841 static int
9842 perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
9843                              struct list_head *filters)
9844 {
9845         struct perf_addr_filter *filter = NULL;
9846         char *start, *orig, *filename = NULL;
9847         substring_t args[MAX_OPT_ARGS];
9848         int state = IF_STATE_ACTION, token;
9849         unsigned int kernel = 0;
9850         int ret = -EINVAL;
9851
9852         orig = fstr = kstrdup(fstr, GFP_KERNEL);
9853         if (!fstr)
9854                 return -ENOMEM;
9855
9856         while ((start = strsep(&fstr, " ,\n")) != NULL) {
9857                 static const enum perf_addr_filter_action_t actions[] = {
9858                         [IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER,
9859                         [IF_ACT_START]  = PERF_ADDR_FILTER_ACTION_START,
9860                         [IF_ACT_STOP]   = PERF_ADDR_FILTER_ACTION_STOP,
9861                 };
9862                 ret = -EINVAL;
9863
9864                 if (!*start)
9865                         continue;
9866
9867                 /* filter definition begins */
9868                 if (state == IF_STATE_ACTION) {
9869                         filter = perf_addr_filter_new(event, filters);
9870                         if (!filter)
9871                                 goto fail;
9872                 }
9873
9874                 token = match_token(start, if_tokens, args);
9875                 switch (token) {
9876                 case IF_ACT_FILTER:
9877                 case IF_ACT_START:
9878                 case IF_ACT_STOP:
9879                         if (state != IF_STATE_ACTION)
9880                                 goto fail;
9881
9882                         filter->action = actions[token];
9883                         state = IF_STATE_SOURCE;
9884                         break;
9885
9886                 case IF_SRC_KERNELADDR:
9887                 case IF_SRC_KERNEL:
9888                         kernel = 1;
9889                         /* fall through */
9890
9891                 case IF_SRC_FILEADDR:
9892                 case IF_SRC_FILE:
9893                         if (state != IF_STATE_SOURCE)
9894                                 goto fail;
9895
9896                         *args[0].to = 0;
9897                         ret = kstrtoul(args[0].from, 0, &filter->offset);
9898                         if (ret)
9899                                 goto fail;
9900
9901                         if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) {
9902                                 *args[1].to = 0;
9903                                 ret = kstrtoul(args[1].from, 0, &filter->size);
9904                                 if (ret)
9905                                         goto fail;
9906                         }
9907
9908                         if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
9909                                 int fpos = token == IF_SRC_FILE ? 2 : 1;
9910
9911                                 filename = match_strdup(&args[fpos]);
9912                                 if (!filename) {
9913                                         ret = -ENOMEM;
9914                                         goto fail;
9915                                 }
9916                         }
9917
9918                         state = IF_STATE_END;
9919                         break;
9920
9921                 default:
9922                         goto fail;
9923                 }
9924
9925                 /*
9926                  * Filter definition is fully parsed, validate and install it.
9927                  * Make sure that it doesn't contradict itself or the event's
9928                  * attribute.
9929                  */
9930                 if (state == IF_STATE_END) {
9931                         ret = -EINVAL;
9932                         if (kernel && event->attr.exclude_kernel)
9933                                 goto fail;
9934
9935                         /*
9936                          * ACTION "filter" must have a non-zero length region
9937                          * specified.
9938                          */
9939                         if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER &&
9940                             !filter->size)
9941                                 goto fail;
9942
9943                         if (!kernel) {
9944                                 if (!filename)
9945                                         goto fail;
9946
9947                                 /*
9948                                  * For now, we only support file-based filters
9949                                  * in per-task events; doing so for CPU-wide
9950                                  * events requires additional context switching
9951                                  * trickery, since same object code will be
9952                                  * mapped at different virtual addresses in
9953                                  * different processes.
9954                                  */
9955                                 ret = -EOPNOTSUPP;
9956                                 if (!event->ctx->task)
9957                                         goto fail_free_name;
9958
9959                                 /* look up the path and grab its inode */
9960                                 ret = kern_path(filename, LOOKUP_FOLLOW,
9961                                                 &filter->path);
9962                                 if (ret)
9963                                         goto fail_free_name;
9964
9965                                 kfree(filename);
9966                                 filename = NULL;
9967
9968                                 ret = -EINVAL;
9969                                 if (!filter->path.dentry ||
9970                                     !S_ISREG(d_inode(filter->path.dentry)
9971                                              ->i_mode))
9972                                         goto fail;
9973
9974                                 event->addr_filters.nr_file_filters++;
9975                         }
9976
9977                         /* ready to consume more filters */
9978                         state = IF_STATE_ACTION;
9979                         filter = NULL;
9980                 }
9981         }
9982
9983         if (state != IF_STATE_ACTION)
9984                 goto fail;
9985
9986         kfree(orig);
9987
9988         return 0;
9989
9990 fail_free_name:
9991         kfree(filename);
9992 fail:
9993         free_filters_list(filters);
9994         kfree(orig);
9995
9996         return ret;
9997 }
9998
9999 static int
10000 perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
10001 {
10002         LIST_HEAD(filters);
10003         int ret;
10004
10005         /*
10006          * Since this is called in perf_ioctl() path, we're already holding
10007          * ctx::mutex.
10008          */
10009         lockdep_assert_held(&event->ctx->mutex);
10010
10011         if (WARN_ON_ONCE(event->parent))
10012                 return -EINVAL;
10013
10014         ret = perf_event_parse_addr_filter(event, filter_str, &filters);
10015         if (ret)
10016                 goto fail_clear_files;
10017
10018         ret = event->pmu->addr_filters_validate(&filters);
10019         if (ret)
10020                 goto fail_free_filters;
10021
10022         /* remove existing filters, if any */
10023         perf_addr_filters_splice(event, &filters);
10024
10025         /* install new filters */
10026         perf_event_for_each_child(event, perf_event_addr_filters_apply);
10027
10028         return ret;
10029
10030 fail_free_filters:
10031         free_filters_list(&filters);
10032
10033 fail_clear_files:
10034         event->addr_filters.nr_file_filters = 0;
10035
10036         return ret;
10037 }
10038
10039 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
10040 {
10041         int ret = -EINVAL;
10042         char *filter_str;
10043
10044         filter_str = strndup_user(arg, PAGE_SIZE);
10045         if (IS_ERR(filter_str))
10046                 return PTR_ERR(filter_str);
10047
10048 #ifdef CONFIG_EVENT_TRACING
10049         if (perf_event_is_tracing(event)) {
10050                 struct perf_event_context *ctx = event->ctx;
10051
10052                 /*
10053                  * Beware, here be dragons!!
10054                  *
10055                  * the tracepoint muck will deadlock against ctx->mutex, but
10056                  * the tracepoint stuff does not actually need it. So
10057                  * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we
10058                  * already have a reference on ctx.
10059                  *
10060                  * This can result in event getting moved to a different ctx,
10061                  * but that does not affect the tracepoint state.
10062                  */
10063                 mutex_unlock(&ctx->mutex);
10064                 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
10065                 mutex_lock(&ctx->mutex);
10066         } else
10067 #endif
10068         if (has_addr_filter(event))
10069                 ret = perf_event_set_addr_filter(event, filter_str);
10070
10071         kfree(filter_str);
10072         return ret;
10073 }
10074
10075 /*
10076  * hrtimer based swevent callback
10077  */
10078
10079 static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
10080 {
10081         enum hrtimer_restart ret = HRTIMER_RESTART;
10082         struct perf_sample_data data;
10083         struct pt_regs *regs;
10084         struct perf_event *event;
10085         u64 period;
10086
10087         event = container_of(hrtimer, struct perf_event, hw.hrtimer);
10088
10089         if (event->state != PERF_EVENT_STATE_ACTIVE)
10090                 return HRTIMER_NORESTART;
10091
10092         event->pmu->read(event);
10093
10094         perf_sample_data_init(&data, 0, event->hw.last_period);
10095         regs = get_irq_regs();
10096
10097         if (regs && !perf_exclude_event(event, regs)) {
10098                 if (!(event->attr.exclude_idle && is_idle_task(current)))
10099                         if (__perf_event_overflow(event, 1, &data, regs))
10100                                 ret = HRTIMER_NORESTART;
10101         }
10102
10103         period = max_t(u64, 10000, event->hw.sample_period);
10104         hrtimer_forward_now(hrtimer, ns_to_ktime(period));
10105
10106         return ret;
10107 }
10108
10109 static void perf_swevent_start_hrtimer(struct perf_event *event)
10110 {
10111         struct hw_perf_event *hwc = &event->hw;
10112         s64 period;
10113
10114         if (!is_sampling_event(event))
10115                 return;
10116
10117         period = local64_read(&hwc->period_left);
10118         if (period) {
10119                 if (period < 0)
10120                         period = 10000;
10121
10122                 local64_set(&hwc->period_left, 0);
10123         } else {
10124                 period = max_t(u64, 10000, hwc->sample_period);
10125         }
10126         hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
10127                       HRTIMER_MODE_REL_PINNED_HARD);
10128 }
10129
10130 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
10131 {
10132         struct hw_perf_event *hwc = &event->hw;
10133
10134         if (is_sampling_event(event)) {
10135                 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
10136                 local64_set(&hwc->period_left, ktime_to_ns(remaining));
10137
10138                 hrtimer_cancel(&hwc->hrtimer);
10139         }
10140 }
10141
10142 static void perf_swevent_init_hrtimer(struct perf_event *event)
10143 {
10144         struct hw_perf_event *hwc = &event->hw;
10145
10146         if (!is_sampling_event(event))
10147                 return;
10148
10149         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
10150         hwc->hrtimer.function = perf_swevent_hrtimer;
10151
10152         /*
10153          * Since hrtimers have a fixed rate, we can do a static freq->period
10154          * mapping and avoid the whole period adjust feedback stuff.
10155          */
10156         if (event->attr.freq) {
10157                 long freq = event->attr.sample_freq;
10158
10159                 event->attr.sample_period = NSEC_PER_SEC / freq;
10160                 hwc->sample_period = event->attr.sample_period;
10161                 local64_set(&hwc->period_left, hwc->sample_period);
10162                 hwc->last_period = hwc->sample_period;
10163                 event->attr.freq = 0;
10164         }
10165 }
10166
10167 /*
10168  * Software event: cpu wall time clock
10169  */
10170
10171 static void cpu_clock_event_update(struct perf_event *event)
10172 {
10173         s64 prev;
10174         u64 now;
10175
10176         now = local_clock();
10177         prev = local64_xchg(&event->hw.prev_count, now);
10178         local64_add(now - prev, &event->count);
10179 }
10180
10181 static void cpu_clock_event_start(struct perf_event *event, int flags)
10182 {
10183         local64_set(&event->hw.prev_count, local_clock());
10184         perf_swevent_start_hrtimer(event);
10185 }
10186
10187 static void cpu_clock_event_stop(struct perf_event *event, int flags)
10188 {
10189         perf_swevent_cancel_hrtimer(event);
10190         cpu_clock_event_update(event);
10191 }
10192
10193 static int cpu_clock_event_add(struct perf_event *event, int flags)
10194 {
10195         if (flags & PERF_EF_START)
10196                 cpu_clock_event_start(event, flags);
10197         perf_event_update_userpage(event);
10198
10199         return 0;
10200 }
10201
10202 static void cpu_clock_event_del(struct perf_event *event, int flags)
10203 {
10204         cpu_clock_event_stop(event, flags);
10205 }
10206
10207 static void cpu_clock_event_read(struct perf_event *event)
10208 {
10209         cpu_clock_event_update(event);
10210 }
10211
10212 static int cpu_clock_event_init(struct perf_event *event)
10213 {
10214         if (event->attr.type != PERF_TYPE_SOFTWARE)
10215                 return -ENOENT;
10216
10217         if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
10218                 return -ENOENT;
10219
10220         /*
10221          * no branch sampling for software events
10222          */
10223         if (has_branch_stack(event))
10224                 return -EOPNOTSUPP;
10225
10226         perf_swevent_init_hrtimer(event);
10227
10228         return 0;
10229 }
10230
10231 static struct pmu perf_cpu_clock = {
10232         .task_ctx_nr    = perf_sw_context,
10233
10234         .capabilities   = PERF_PMU_CAP_NO_NMI,
10235
10236         .event_init     = cpu_clock_event_init,
10237         .add            = cpu_clock_event_add,
10238         .del            = cpu_clock_event_del,
10239         .start          = cpu_clock_event_start,
10240         .stop           = cpu_clock_event_stop,
10241         .read           = cpu_clock_event_read,
10242 };
10243
10244 /*
10245  * Software event: task time clock
10246  */
10247
10248 static void task_clock_event_update(struct perf_event *event, u64 now)
10249 {
10250         u64 prev;
10251         s64 delta;
10252
10253         prev = local64_xchg(&event->hw.prev_count, now);
10254         delta = now - prev;
10255         local64_add(delta, &event->count);
10256 }
10257
10258 static void task_clock_event_start(struct perf_event *event, int flags)
10259 {
10260         local64_set(&event->hw.prev_count, event->ctx->time);
10261         perf_swevent_start_hrtimer(event);
10262 }
10263
10264 static void task_clock_event_stop(struct perf_event *event, int flags)
10265 {
10266         perf_swevent_cancel_hrtimer(event);
10267         task_clock_event_update(event, event->ctx->time);
10268 }
10269
10270 static int task_clock_event_add(struct perf_event *event, int flags)
10271 {
10272         if (flags & PERF_EF_START)
10273                 task_clock_event_start(event, flags);
10274         perf_event_update_userpage(event);
10275
10276         return 0;
10277 }
10278
10279 static void task_clock_event_del(struct perf_event *event, int flags)
10280 {
10281         task_clock_event_stop(event, PERF_EF_UPDATE);
10282 }
10283
10284 static void task_clock_event_read(struct perf_event *event)
10285 {
10286         u64 now = perf_clock();
10287         u64 delta = now - event->ctx->timestamp;
10288         u64 time = event->ctx->time + delta;
10289
10290         task_clock_event_update(event, time);
10291 }
10292
10293 static int task_clock_event_init(struct perf_event *event)
10294 {
10295         if (event->attr.type != PERF_TYPE_SOFTWARE)
10296                 return -ENOENT;
10297
10298         if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
10299                 return -ENOENT;
10300
10301         /*
10302          * no branch sampling for software events
10303          */
10304         if (has_branch_stack(event))
10305                 return -EOPNOTSUPP;
10306
10307         perf_swevent_init_hrtimer(event);
10308
10309         return 0;
10310 }
10311
10312 static struct pmu perf_task_clock = {
10313         .task_ctx_nr    = perf_sw_context,
10314
10315         .capabilities   = PERF_PMU_CAP_NO_NMI,
10316
10317         .event_init     = task_clock_event_init,
10318         .add            = task_clock_event_add,
10319         .del            = task_clock_event_del,
10320         .start          = task_clock_event_start,
10321         .stop           = task_clock_event_stop,
10322         .read           = task_clock_event_read,
10323 };
10324
10325 static void perf_pmu_nop_void(struct pmu *pmu)
10326 {
10327 }
10328
10329 static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
10330 {
10331 }
10332
10333 static int perf_pmu_nop_int(struct pmu *pmu)
10334 {
10335         return 0;
10336 }
10337
10338 static int perf_event_nop_int(struct perf_event *event, u64 value)
10339 {
10340         return 0;
10341 }
10342
10343 static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
10344
10345 static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
10346 {
10347         __this_cpu_write(nop_txn_flags, flags);
10348
10349         if (flags & ~PERF_PMU_TXN_ADD)
10350                 return;
10351
10352         perf_pmu_disable(pmu);
10353 }
10354
10355 static int perf_pmu_commit_txn(struct pmu *pmu)
10356 {
10357         unsigned int flags = __this_cpu_read(nop_txn_flags);
10358
10359         __this_cpu_write(nop_txn_flags, 0);
10360
10361         if (flags & ~PERF_PMU_TXN_ADD)
10362                 return 0;
10363
10364         perf_pmu_enable(pmu);
10365         return 0;
10366 }
10367
10368 static void perf_pmu_cancel_txn(struct pmu *pmu)
10369 {
10370         unsigned int flags =  __this_cpu_read(nop_txn_flags);
10371
10372         __this_cpu_write(nop_txn_flags, 0);
10373
10374         if (flags & ~PERF_PMU_TXN_ADD)
10375                 return;
10376
10377         perf_pmu_enable(pmu);
10378 }
10379
10380 static int perf_event_idx_default(struct perf_event *event)
10381 {
10382         return 0;
10383 }
10384
10385 /*
10386  * Ensures all contexts with the same task_ctx_nr have the same
10387  * pmu_cpu_context too.
10388  */
10389 static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
10390 {
10391         struct pmu *pmu;
10392
10393         if (ctxn < 0)
10394                 return NULL;
10395
10396         list_for_each_entry(pmu, &pmus, entry) {
10397                 if (pmu->task_ctx_nr == ctxn)
10398                         return pmu->pmu_cpu_context;
10399         }
10400
10401         return NULL;
10402 }
10403
10404 static void free_pmu_context(struct pmu *pmu)
10405 {
10406         /*
10407          * Static contexts such as perf_sw_context have a global lifetime
10408          * and may be shared between different PMUs. Avoid freeing them
10409          * when a single PMU is going away.
10410          */
10411         if (pmu->task_ctx_nr > perf_invalid_context)
10412                 return;
10413
10414         free_percpu(pmu->pmu_cpu_context);
10415 }
10416
10417 /*
10418  * Let userspace know that this PMU supports address range filtering:
10419  */
10420 static ssize_t nr_addr_filters_show(struct device *dev,
10421                                     struct device_attribute *attr,
10422                                     char *page)
10423 {
10424         struct pmu *pmu = dev_get_drvdata(dev);
10425
10426         return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
10427 }
10428 DEVICE_ATTR_RO(nr_addr_filters);
10429
10430 static struct idr pmu_idr;
10431
10432 static ssize_t
10433 type_show(struct device *dev, struct device_attribute *attr, char *page)
10434 {
10435         struct pmu *pmu = dev_get_drvdata(dev);
10436
10437         return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
10438 }
10439 static DEVICE_ATTR_RO(type);
10440
10441 static ssize_t
10442 perf_event_mux_interval_ms_show(struct device *dev,
10443                                 struct device_attribute *attr,
10444                                 char *page)
10445 {
10446         struct pmu *pmu = dev_get_drvdata(dev);
10447
10448         return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
10449 }
10450
10451 static DEFINE_MUTEX(mux_interval_mutex);
10452
10453 static ssize_t
10454 perf_event_mux_interval_ms_store(struct device *dev,
10455                                  struct device_attribute *attr,
10456                                  const char *buf, size_t count)
10457 {
10458         struct pmu *pmu = dev_get_drvdata(dev);
10459         int timer, cpu, ret;
10460
10461         ret = kstrtoint(buf, 0, &timer);
10462         if (ret)
10463                 return ret;
10464
10465         if (timer < 1)
10466                 return -EINVAL;
10467
10468         /* same value, noting to do */
10469         if (timer == pmu->hrtimer_interval_ms)
10470                 return count;
10471
10472         mutex_lock(&mux_interval_mutex);
10473         pmu->hrtimer_interval_ms = timer;
10474
10475         /* update all cpuctx for this PMU */
10476         cpus_read_lock();
10477         for_each_online_cpu(cpu) {
10478                 struct perf_cpu_context *cpuctx;
10479                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
10480                 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
10481
10482                 cpu_function_call(cpu,
10483                         (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
10484         }
10485         cpus_read_unlock();
10486         mutex_unlock(&mux_interval_mutex);
10487
10488         return count;
10489 }
10490 static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
10491
10492 static struct attribute *pmu_dev_attrs[] = {
10493         &dev_attr_type.attr,
10494         &dev_attr_perf_event_mux_interval_ms.attr,
10495         NULL,
10496 };
10497 ATTRIBUTE_GROUPS(pmu_dev);
10498
10499 static int pmu_bus_running;
10500 static struct bus_type pmu_bus = {
10501         .name           = "event_source",
10502         .dev_groups     = pmu_dev_groups,
10503 };
10504
10505 static void pmu_dev_release(struct device *dev)
10506 {
10507         kfree(dev);
10508 }
10509
10510 static int pmu_dev_alloc(struct pmu *pmu)
10511 {
10512         int ret = -ENOMEM;
10513
10514         pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
10515         if (!pmu->dev)
10516                 goto out;
10517
10518         pmu->dev->groups = pmu->attr_groups;
10519         device_initialize(pmu->dev);
10520         ret = dev_set_name(pmu->dev, "%s", pmu->name);
10521         if (ret)
10522                 goto free_dev;
10523
10524         dev_set_drvdata(pmu->dev, pmu);
10525         pmu->dev->bus = &pmu_bus;
10526         pmu->dev->release = pmu_dev_release;
10527         ret = device_add(pmu->dev);
10528         if (ret)
10529                 goto free_dev;
10530
10531         /* For PMUs with address filters, throw in an extra attribute: */
10532         if (pmu->nr_addr_filters)
10533                 ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
10534
10535         if (ret)
10536                 goto del_dev;
10537
10538         if (pmu->attr_update)
10539                 ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
10540
10541         if (ret)
10542                 goto del_dev;
10543
10544 out:
10545         return ret;
10546
10547 del_dev:
10548         device_del(pmu->dev);
10549
10550 free_dev:
10551         put_device(pmu->dev);
10552         goto out;
10553 }
10554
10555 static struct lock_class_key cpuctx_mutex;
10556 static struct lock_class_key cpuctx_lock;
10557
10558 int perf_pmu_register(struct pmu *pmu, const char *name, int type)
10559 {
10560         int cpu, ret, max = PERF_TYPE_MAX;
10561
10562         mutex_lock(&pmus_lock);
10563         ret = -ENOMEM;
10564         pmu->pmu_disable_count = alloc_percpu(int);
10565         if (!pmu->pmu_disable_count)
10566                 goto unlock;
10567
10568         pmu->type = -1;
10569         if (!name)
10570                 goto skip_type;
10571         pmu->name = name;
10572
10573         if (type != PERF_TYPE_SOFTWARE) {
10574                 if (type >= 0)
10575                         max = type;
10576
10577                 ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
10578                 if (ret < 0)
10579                         goto free_pdc;
10580
10581                 WARN_ON(type >= 0 && ret != type);
10582
10583                 type = ret;
10584         }
10585         pmu->type = type;
10586
10587         if (pmu_bus_running) {
10588                 ret = pmu_dev_alloc(pmu);
10589                 if (ret)
10590                         goto free_idr;
10591         }
10592
10593 skip_type:
10594         if (pmu->task_ctx_nr == perf_hw_context) {
10595                 static int hw_context_taken = 0;
10596
10597                 /*
10598                  * Other than systems with heterogeneous CPUs, it never makes
10599                  * sense for two PMUs to share perf_hw_context. PMUs which are
10600                  * uncore must use perf_invalid_context.
10601                  */
10602                 if (WARN_ON_ONCE(hw_context_taken &&
10603                     !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
10604                         pmu->task_ctx_nr = perf_invalid_context;
10605
10606                 hw_context_taken = 1;
10607         }
10608
10609         pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
10610         if (pmu->pmu_cpu_context)
10611                 goto got_cpu_context;
10612
10613         ret = -ENOMEM;
10614         pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
10615         if (!pmu->pmu_cpu_context)
10616                 goto free_dev;
10617
10618         for_each_possible_cpu(cpu) {
10619                 struct perf_cpu_context *cpuctx;
10620
10621                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
10622                 __perf_event_init_context(&cpuctx->ctx);
10623                 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
10624                 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
10625                 cpuctx->ctx.pmu = pmu;
10626                 cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
10627
10628                 __perf_mux_hrtimer_init(cpuctx, cpu);
10629
10630                 cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
10631                 cpuctx->heap = cpuctx->heap_default;
10632         }
10633
10634 got_cpu_context:
10635         if (!pmu->start_txn) {
10636                 if (pmu->pmu_enable) {
10637                         /*
10638                          * If we have pmu_enable/pmu_disable calls, install
10639                          * transaction stubs that use that to try and batch
10640                          * hardware accesses.
10641                          */
10642                         pmu->start_txn  = perf_pmu_start_txn;
10643                         pmu->commit_txn = perf_pmu_commit_txn;
10644                         pmu->cancel_txn = perf_pmu_cancel_txn;
10645                 } else {
10646                         pmu->start_txn  = perf_pmu_nop_txn;
10647                         pmu->commit_txn = perf_pmu_nop_int;
10648                         pmu->cancel_txn = perf_pmu_nop_void;
10649                 }
10650         }
10651
10652         if (!pmu->pmu_enable) {
10653                 pmu->pmu_enable  = perf_pmu_nop_void;
10654                 pmu->pmu_disable = perf_pmu_nop_void;
10655         }
10656
10657         if (!pmu->check_period)
10658                 pmu->check_period = perf_event_nop_int;
10659
10660         if (!pmu->event_idx)
10661                 pmu->event_idx = perf_event_idx_default;
10662
10663         /*
10664          * Ensure the TYPE_SOFTWARE PMUs are at the head of the list,
10665          * since these cannot be in the IDR. This way the linear search
10666          * is fast, provided a valid software event is provided.
10667          */
10668         if (type == PERF_TYPE_SOFTWARE || !name)
10669                 list_add_rcu(&pmu->entry, &pmus);
10670         else
10671                 list_add_tail_rcu(&pmu->entry, &pmus);
10672
10673         atomic_set(&pmu->exclusive_cnt, 0);
10674         ret = 0;
10675 unlock:
10676         mutex_unlock(&pmus_lock);
10677
10678         return ret;
10679
10680 free_dev:
10681         device_del(pmu->dev);
10682         put_device(pmu->dev);
10683
10684 free_idr:
10685         if (pmu->type != PERF_TYPE_SOFTWARE)
10686                 idr_remove(&pmu_idr, pmu->type);
10687
10688 free_pdc:
10689         free_percpu(pmu->pmu_disable_count);
10690         goto unlock;
10691 }
10692 EXPORT_SYMBOL_GPL(perf_pmu_register);
10693
10694 void perf_pmu_unregister(struct pmu *pmu)
10695 {
10696         mutex_lock(&pmus_lock);
10697         list_del_rcu(&pmu->entry);
10698
10699         /*
10700          * We dereference the pmu list under both SRCU and regular RCU, so
10701          * synchronize against both of those.
10702          */
10703         synchronize_srcu(&pmus_srcu);
10704         synchronize_rcu();
10705
10706         free_percpu(pmu->pmu_disable_count);
10707         if (pmu->type != PERF_TYPE_SOFTWARE)
10708                 idr_remove(&pmu_idr, pmu->type);
10709         if (pmu_bus_running) {
10710                 if (pmu->nr_addr_filters)
10711                         device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
10712                 device_del(pmu->dev);
10713                 put_device(pmu->dev);
10714         }
10715         free_pmu_context(pmu);
10716         mutex_unlock(&pmus_lock);
10717 }
10718 EXPORT_SYMBOL_GPL(perf_pmu_unregister);
10719
10720 static inline bool has_extended_regs(struct perf_event *event)
10721 {
10722         return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) ||
10723                (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
10724 }
10725
10726 static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
10727 {
10728         struct perf_event_context *ctx = NULL;
10729         int ret;
10730
10731         if (!try_module_get(pmu->module))
10732                 return -ENODEV;
10733
10734         /*
10735          * A number of pmu->event_init() methods iterate the sibling_list to,
10736          * for example, validate if the group fits on the PMU. Therefore,
10737          * if this is a sibling event, acquire the ctx->mutex to protect
10738          * the sibling_list.
10739          */
10740         if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
10741                 /*
10742                  * This ctx->mutex can nest when we're called through
10743                  * inheritance. See the perf_event_ctx_lock_nested() comment.
10744                  */
10745                 ctx = perf_event_ctx_lock_nested(event->group_leader,
10746                                                  SINGLE_DEPTH_NESTING);
10747                 BUG_ON(!ctx);
10748         }
10749
10750         event->pmu = pmu;
10751         ret = pmu->event_init(event);
10752
10753         if (ctx)
10754                 perf_event_ctx_unlock(event->group_leader, ctx);
10755
10756         if (!ret) {
10757                 if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
10758                     has_extended_regs(event))
10759                         ret = -EOPNOTSUPP;
10760
10761                 if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
10762                     event_has_any_exclude_flag(event))
10763                         ret = -EINVAL;
10764
10765                 if (ret && event->destroy)
10766                         event->destroy(event);
10767         }
10768
10769         if (ret)
10770                 module_put(pmu->module);
10771
10772         return ret;
10773 }
10774
10775 static struct pmu *perf_init_event(struct perf_event *event)
10776 {
10777         int idx, type, ret;
10778         struct pmu *pmu;
10779
10780         idx = srcu_read_lock(&pmus_srcu);
10781
10782         /* Try parent's PMU first: */
10783         if (event->parent && event->parent->pmu) {
10784                 pmu = event->parent->pmu;
10785                 ret = perf_try_init_event(pmu, event);
10786                 if (!ret)
10787                         goto unlock;
10788         }
10789
10790         /*
10791          * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
10792          * are often aliases for PERF_TYPE_RAW.
10793          */
10794         type = event->attr.type;
10795         if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE)
10796                 type = PERF_TYPE_RAW;
10797
10798 again:
10799         rcu_read_lock();
10800         pmu = idr_find(&pmu_idr, type);
10801         rcu_read_unlock();
10802         if (pmu) {
10803                 ret = perf_try_init_event(pmu, event);
10804                 if (ret == -ENOENT && event->attr.type != type) {
10805                         type = event->attr.type;
10806                         goto again;
10807                 }
10808
10809                 if (ret)
10810                         pmu = ERR_PTR(ret);
10811
10812                 goto unlock;
10813         }
10814
10815         list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
10816                 ret = perf_try_init_event(pmu, event);
10817                 if (!ret)
10818                         goto unlock;
10819
10820                 if (ret != -ENOENT) {
10821                         pmu = ERR_PTR(ret);
10822                         goto unlock;
10823                 }
10824         }
10825         pmu = ERR_PTR(-ENOENT);
10826 unlock:
10827         srcu_read_unlock(&pmus_srcu, idx);
10828
10829         return pmu;
10830 }
10831
10832 static void attach_sb_event(struct perf_event *event)
10833 {
10834         struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
10835
10836         raw_spin_lock(&pel->lock);
10837         list_add_rcu(&event->sb_list, &pel->list);
10838         raw_spin_unlock(&pel->lock);
10839 }
10840
10841 /*
10842  * We keep a list of all !task (and therefore per-cpu) events
10843  * that need to receive side-band records.
10844  *
10845  * This avoids having to scan all the various PMU per-cpu contexts
10846  * looking for them.
10847  */
10848 static void account_pmu_sb_event(struct perf_event *event)
10849 {
10850         if (is_sb_event(event))
10851                 attach_sb_event(event);
10852 }
10853
10854 static void account_event_cpu(struct perf_event *event, int cpu)
10855 {
10856         if (event->parent)
10857                 return;
10858
10859         if (is_cgroup_event(event))
10860                 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
10861 }
10862
10863 /* Freq events need the tick to stay alive (see perf_event_task_tick). */
10864 static void account_freq_event_nohz(void)
10865 {
10866 #ifdef CONFIG_NO_HZ_FULL
10867         /* Lock so we don't race with concurrent unaccount */
10868         spin_lock(&nr_freq_lock);
10869         if (atomic_inc_return(&nr_freq_events) == 1)
10870                 tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
10871         spin_unlock(&nr_freq_lock);
10872 #endif
10873 }
10874
10875 static void account_freq_event(void)
10876 {
10877         if (tick_nohz_full_enabled())
10878                 account_freq_event_nohz();
10879         else
10880                 atomic_inc(&nr_freq_events);
10881 }
10882
10883
10884 static void account_event(struct perf_event *event)
10885 {
10886         bool inc = false;
10887
10888         if (event->parent)
10889                 return;
10890
10891         if (event->attach_state & PERF_ATTACH_TASK)
10892                 inc = true;
10893         if (event->attr.mmap || event->attr.mmap_data)
10894                 atomic_inc(&nr_mmap_events);
10895         if (event->attr.comm)
10896                 atomic_inc(&nr_comm_events);
10897         if (event->attr.namespaces)
10898                 atomic_inc(&nr_namespaces_events);
10899         if (event->attr.cgroup)
10900                 atomic_inc(&nr_cgroup_events);
10901         if (event->attr.task)
10902                 atomic_inc(&nr_task_events);
10903         if (event->attr.freq)
10904                 account_freq_event();
10905         if (event->attr.context_switch) {
10906                 atomic_inc(&nr_switch_events);
10907                 inc = true;
10908         }
10909         if (has_branch_stack(event))
10910                 inc = true;
10911         if (is_cgroup_event(event))
10912                 inc = true;
10913         if (event->attr.ksymbol)
10914                 atomic_inc(&nr_ksymbol_events);
10915         if (event->attr.bpf_event)
10916                 atomic_inc(&nr_bpf_events);
10917
10918         if (inc) {
10919                 /*
10920                  * We need the mutex here because static_branch_enable()
10921                  * must complete *before* the perf_sched_count increment
10922                  * becomes visible.
10923                  */
10924                 if (atomic_inc_not_zero(&perf_sched_count))
10925                         goto enabled;
10926
10927                 mutex_lock(&perf_sched_mutex);
10928                 if (!atomic_read(&perf_sched_count)) {
10929                         static_branch_enable(&perf_sched_events);
10930                         /*
10931                          * Guarantee that all CPUs observe they key change and
10932                          * call the perf scheduling hooks before proceeding to
10933                          * install events that need them.
10934                          */
10935                         synchronize_rcu();
10936                 }
10937                 /*
10938                  * Now that we have waited for the sync_sched(), allow further
10939                  * increments to by-pass the mutex.
10940                  */
10941                 atomic_inc(&perf_sched_count);
10942                 mutex_unlock(&perf_sched_mutex);
10943         }
10944 enabled:
10945
10946         account_event_cpu(event, event->cpu);
10947
10948         account_pmu_sb_event(event);
10949 }
10950
10951 /*
10952  * Allocate and initialize an event structure
10953  */
10954 static struct perf_event *
10955 perf_event_alloc(struct perf_event_attr *attr, int cpu,
10956                  struct task_struct *task,
10957                  struct perf_event *group_leader,
10958                  struct perf_event *parent_event,
10959                  perf_overflow_handler_t overflow_handler,
10960                  void *context, int cgroup_fd)
10961 {
10962         struct pmu *pmu;
10963         struct perf_event *event;
10964         struct hw_perf_event *hwc;
10965         long err = -EINVAL;
10966
10967         if ((unsigned)cpu >= nr_cpu_ids) {
10968                 if (!task || cpu != -1)
10969                         return ERR_PTR(-EINVAL);
10970         }
10971
10972         event = kzalloc(sizeof(*event), GFP_KERNEL);
10973         if (!event)
10974                 return ERR_PTR(-ENOMEM);
10975
10976         /*
10977          * Single events are their own group leaders, with an
10978          * empty sibling list:
10979          */
10980         if (!group_leader)
10981                 group_leader = event;
10982
10983         mutex_init(&event->child_mutex);
10984         INIT_LIST_HEAD(&event->child_list);
10985
10986         INIT_LIST_HEAD(&event->event_entry);
10987         INIT_LIST_HEAD(&event->sibling_list);
10988         INIT_LIST_HEAD(&event->active_list);
10989         init_event_group(event);
10990         INIT_LIST_HEAD(&event->rb_entry);
10991         INIT_LIST_HEAD(&event->active_entry);
10992         INIT_LIST_HEAD(&event->addr_filters.list);
10993         INIT_HLIST_NODE(&event->hlist_entry);
10994
10995
10996         init_waitqueue_head(&event->waitq);
10997         event->pending_disable = -1;
10998         init_irq_work(&event->pending, perf_pending_event);
10999
11000         mutex_init(&event->mmap_mutex);
11001         raw_spin_lock_init(&event->addr_filters.lock);
11002
11003         atomic_long_set(&event->refcount, 1);
11004         event->cpu              = cpu;
11005         event->attr             = *attr;
11006         event->group_leader     = group_leader;
11007         event->pmu              = NULL;
11008         event->oncpu            = -1;
11009
11010         event->parent           = parent_event;
11011
11012         event->ns               = get_pid_ns(task_active_pid_ns(current));
11013         event->id               = atomic64_inc_return(&perf_event_id);
11014
11015         event->state            = PERF_EVENT_STATE_INACTIVE;
11016
11017         if (task) {
11018                 event->attach_state = PERF_ATTACH_TASK;
11019                 /*
11020                  * XXX pmu::event_init needs to know what task to account to
11021                  * and we cannot use the ctx information because we need the
11022                  * pmu before we get a ctx.
11023                  */
11024                 event->hw.target = get_task_struct(task);
11025         }
11026
11027         event->clock = &local_clock;
11028         if (parent_event)
11029                 event->clock = parent_event->clock;
11030
11031         if (!overflow_handler && parent_event) {
11032                 overflow_handler = parent_event->overflow_handler;
11033                 context = parent_event->overflow_handler_context;
11034 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
11035                 if (overflow_handler == bpf_overflow_handler) {
11036                         struct bpf_prog *prog = parent_event->prog;
11037
11038                         bpf_prog_inc(prog);
11039                         event->prog = prog;
11040                         event->orig_overflow_handler =
11041                                 parent_event->orig_overflow_handler;
11042                 }
11043 #endif
11044         }
11045
11046         if (overflow_handler) {
11047                 event->overflow_handler = overflow_handler;
11048                 event->overflow_handler_context = context;
11049         } else if (is_write_backward(event)){
11050                 event->overflow_handler = perf_event_output_backward;
11051                 event->overflow_handler_context = NULL;
11052         } else {
11053                 event->overflow_handler = perf_event_output_forward;
11054                 event->overflow_handler_context = NULL;
11055         }
11056
11057         perf_event__state_init(event);
11058
11059         pmu = NULL;
11060
11061         hwc = &event->hw;
11062         hwc->sample_period = attr->sample_period;
11063         if (attr->freq && attr->sample_freq)
11064                 hwc->sample_period = 1;
11065         hwc->last_period = hwc->sample_period;
11066
11067         local64_set(&hwc->period_left, hwc->sample_period);
11068
11069         /*
11070          * We currently do not support PERF_SAMPLE_READ on inherited events.
11071          * See perf_output_read().
11072          */
11073         if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
11074                 goto err_ns;
11075
11076         if (!has_branch_stack(event))
11077                 event->attr.branch_sample_type = 0;
11078
11079         pmu = perf_init_event(event);
11080         if (IS_ERR(pmu)) {
11081                 err = PTR_ERR(pmu);
11082                 goto err_ns;
11083         }
11084
11085         /*
11086          * Disallow uncore-cgroup events, they don't make sense as the cgroup will
11087          * be different on other CPUs in the uncore mask.
11088          */
11089         if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) {
11090                 err = -EINVAL;
11091                 goto err_pmu;
11092         }
11093
11094         if (event->attr.aux_output &&
11095             !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
11096                 err = -EOPNOTSUPP;
11097                 goto err_pmu;
11098         }
11099
11100         if (cgroup_fd != -1) {
11101                 err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
11102                 if (err)
11103                         goto err_pmu;
11104         }
11105
11106         err = exclusive_event_init(event);
11107         if (err)
11108                 goto err_pmu;
11109
11110         if (has_addr_filter(event)) {
11111                 event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
11112                                                     sizeof(struct perf_addr_filter_range),
11113                                                     GFP_KERNEL);
11114                 if (!event->addr_filter_ranges) {
11115                         err = -ENOMEM;
11116                         goto err_per_task;
11117                 }
11118
11119                 /*
11120                  * Clone the parent's vma offsets: they are valid until exec()
11121                  * even if the mm is not shared with the parent.
11122                  */
11123                 if (event->parent) {
11124                         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
11125
11126                         raw_spin_lock_irq(&ifh->lock);
11127                         memcpy(event->addr_filter_ranges,
11128                                event->parent->addr_filter_ranges,
11129                                pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range));
11130                         raw_spin_unlock_irq(&ifh->lock);
11131                 }
11132
11133                 /* force hw sync on the address filters */
11134                 event->addr_filters_gen = 1;
11135         }
11136
11137         if (!event->parent) {
11138                 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
11139                         err = get_callchain_buffers(attr->sample_max_stack);
11140                         if (err)
11141                                 goto err_addr_filters;
11142                 }
11143         }
11144
11145         err = security_perf_event_alloc(event);
11146         if (err)
11147                 goto err_callchain_buffer;
11148
11149         /* symmetric to unaccount_event() in _free_event() */
11150         account_event(event);
11151
11152         return event;
11153
11154 err_callchain_buffer:
11155         if (!event->parent) {
11156                 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
11157                         put_callchain_buffers();
11158         }
11159 err_addr_filters:
11160         kfree(event->addr_filter_ranges);
11161
11162 err_per_task:
11163         exclusive_event_destroy(event);
11164
11165 err_pmu:
11166         if (is_cgroup_event(event))
11167                 perf_detach_cgroup(event);
11168         if (event->destroy)
11169                 event->destroy(event);
11170         module_put(pmu->module);
11171 err_ns:
11172         if (event->ns)
11173                 put_pid_ns(event->ns);
11174         if (event->hw.target)
11175                 put_task_struct(event->hw.target);
11176         kfree(event);
11177
11178         return ERR_PTR(err);
11179 }
11180
11181 static int perf_copy_attr(struct perf_event_attr __user *uattr,
11182                           struct perf_event_attr *attr)
11183 {
11184         u32 size;
11185         int ret;
11186
11187         /* Zero the full structure, so that a short copy will be nice. */
11188         memset(attr, 0, sizeof(*attr));
11189
11190         ret = get_user(size, &uattr->size);
11191         if (ret)
11192                 return ret;
11193
11194         /* ABI compatibility quirk: */
11195         if (!size)
11196                 size = PERF_ATTR_SIZE_VER0;
11197         if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE)
11198                 goto err_size;
11199
11200         ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
11201         if (ret) {
11202                 if (ret == -E2BIG)
11203                         goto err_size;
11204                 return ret;
11205         }
11206
11207         attr->size = size;
11208
11209         if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
11210                 return -EINVAL;
11211
11212         if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
11213                 return -EINVAL;
11214
11215         if (attr->read_format & ~(PERF_FORMAT_MAX-1))
11216                 return -EINVAL;
11217
11218         if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
11219                 u64 mask = attr->branch_sample_type;
11220
11221                 /* only using defined bits */
11222                 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
11223                         return -EINVAL;
11224
11225                 /* at least one branch bit must be set */
11226                 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
11227                         return -EINVAL;
11228
11229                 /* propagate priv level, when not set for branch */
11230                 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
11231
11232                         /* exclude_kernel checked on syscall entry */
11233                         if (!attr->exclude_kernel)
11234                                 mask |= PERF_SAMPLE_BRANCH_KERNEL;
11235
11236                         if (!attr->exclude_user)
11237                                 mask |= PERF_SAMPLE_BRANCH_USER;
11238
11239                         if (!attr->exclude_hv)
11240                                 mask |= PERF_SAMPLE_BRANCH_HV;
11241                         /*
11242                          * adjust user setting (for HW filter setup)
11243                          */
11244                         attr->branch_sample_type = mask;
11245                 }
11246                 /* privileged levels capture (kernel, hv): check permissions */
11247                 if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
11248                         ret = perf_allow_kernel(attr);
11249                         if (ret)
11250                                 return ret;
11251                 }
11252         }
11253
11254         if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
11255                 ret = perf_reg_validate(attr->sample_regs_user);
11256                 if (ret)
11257                         return ret;
11258         }
11259
11260         if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
11261                 if (!arch_perf_have_user_stack_dump())
11262                         return -ENOSYS;
11263
11264                 /*
11265                  * We have __u32 type for the size, but so far
11266                  * we can only use __u16 as maximum due to the
11267                  * __u16 sample size limit.
11268                  */
11269                 if (attr->sample_stack_user >= USHRT_MAX)
11270                         return -EINVAL;
11271                 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
11272                         return -EINVAL;
11273         }
11274
11275         if (!attr->sample_max_stack)
11276                 attr->sample_max_stack = sysctl_perf_event_max_stack;
11277
11278         if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
11279                 ret = perf_reg_validate(attr->sample_regs_intr);
11280
11281 #ifndef CONFIG_CGROUP_PERF
11282         if (attr->sample_type & PERF_SAMPLE_CGROUP)
11283                 return -EINVAL;
11284 #endif
11285
11286 out:
11287         return ret;
11288
11289 err_size:
11290         put_user(sizeof(*attr), &uattr->size);
11291         ret = -E2BIG;
11292         goto out;
11293 }
11294
11295 static int
11296 perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
11297 {
11298         struct perf_buffer *rb = NULL;
11299         int ret = -EINVAL;
11300
11301         if (!output_event)
11302                 goto set;
11303
11304         /* don't allow circular references */
11305         if (event == output_event)
11306                 goto out;
11307
11308         /*
11309          * Don't allow cross-cpu buffers
11310          */
11311         if (output_event->cpu != event->cpu)
11312                 goto out;
11313
11314         /*
11315          * If its not a per-cpu rb, it must be the same task.
11316          */
11317         if (output_event->cpu == -1 && output_event->ctx != event->ctx)
11318                 goto out;
11319
11320         /*
11321          * Mixing clocks in the same buffer is trouble you don't need.
11322          */
11323         if (output_event->clock != event->clock)
11324                 goto out;
11325
11326         /*
11327          * Either writing ring buffer from beginning or from end.
11328          * Mixing is not allowed.
11329          */
11330         if (is_write_backward(output_event) != is_write_backward(event))
11331                 goto out;
11332
11333         /*
11334          * If both events generate aux data, they must be on the same PMU
11335          */
11336         if (has_aux(event) && has_aux(output_event) &&
11337             event->pmu != output_event->pmu)
11338                 goto out;
11339
11340 set:
11341         mutex_lock(&event->mmap_mutex);
11342         /* Can't redirect output if we've got an active mmap() */
11343         if (atomic_read(&event->mmap_count))
11344                 goto unlock;
11345
11346         if (output_event) {
11347                 /* get the rb we want to redirect to */
11348                 rb = ring_buffer_get(output_event);
11349                 if (!rb)
11350                         goto unlock;
11351         }
11352
11353         ring_buffer_attach(event, rb);
11354
11355         ret = 0;
11356 unlock:
11357         mutex_unlock(&event->mmap_mutex);
11358
11359 out:
11360         return ret;
11361 }
11362
11363 static void mutex_lock_double(struct mutex *a, struct mutex *b)
11364 {
11365         if (b < a)
11366                 swap(a, b);
11367
11368         mutex_lock(a);
11369         mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
11370 }
11371
11372 static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
11373 {
11374         bool nmi_safe = false;
11375
11376         switch (clk_id) {
11377         case CLOCK_MONOTONIC:
11378                 event->clock = &ktime_get_mono_fast_ns;
11379                 nmi_safe = true;
11380                 break;
11381
11382         case CLOCK_MONOTONIC_RAW:
11383                 event->clock = &ktime_get_raw_fast_ns;
11384                 nmi_safe = true;
11385                 break;
11386
11387         case CLOCK_REALTIME:
11388                 event->clock = &ktime_get_real_ns;
11389                 break;
11390
11391         case CLOCK_BOOTTIME:
11392                 event->clock = &ktime_get_boottime_ns;
11393                 break;
11394
11395         case CLOCK_TAI:
11396                 event->clock = &ktime_get_clocktai_ns;
11397                 break;
11398
11399         default:
11400                 return -EINVAL;
11401         }
11402
11403         if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
11404                 return -EINVAL;
11405
11406         return 0;
11407 }
11408
11409 /*
11410  * Variation on perf_event_ctx_lock_nested(), except we take two context
11411  * mutexes.
11412  */
11413 static struct perf_event_context *
11414 __perf_event_ctx_lock_double(struct perf_event *group_leader,
11415                              struct perf_event_context *ctx)
11416 {
11417         struct perf_event_context *gctx;
11418
11419 again:
11420         rcu_read_lock();
11421         gctx = READ_ONCE(group_leader->ctx);
11422         if (!refcount_inc_not_zero(&gctx->refcount)) {
11423                 rcu_read_unlock();
11424                 goto again;
11425         }
11426         rcu_read_unlock();
11427
11428         mutex_lock_double(&gctx->mutex, &ctx->mutex);
11429
11430         if (group_leader->ctx != gctx) {
11431                 mutex_unlock(&ctx->mutex);
11432                 mutex_unlock(&gctx->mutex);
11433                 put_ctx(gctx);
11434                 goto again;
11435         }
11436
11437         return gctx;
11438 }
11439
11440 /**
11441  * sys_perf_event_open - open a performance event, associate it to a task/cpu
11442  *
11443  * @attr_uptr:  event_id type attributes for monitoring/sampling
11444  * @pid:                target pid
11445  * @cpu:                target cpu
11446  * @group_fd:           group leader event fd
11447  */
11448 SYSCALL_DEFINE5(perf_event_open,
11449                 struct perf_event_attr __user *, attr_uptr,
11450                 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
11451 {
11452         struct perf_event *group_leader = NULL, *output_event = NULL;
11453         struct perf_event *event, *sibling;
11454         struct perf_event_attr attr;
11455         struct perf_event_context *ctx, *uninitialized_var(gctx);
11456         struct file *event_file = NULL;
11457         struct fd group = {NULL, 0};
11458         struct task_struct *task = NULL;
11459         struct pmu *pmu;
11460         int event_fd;
11461         int move_group = 0;
11462         int err;
11463         int f_flags = O_RDWR;
11464         int cgroup_fd = -1;
11465
11466         /* for future expandability... */
11467         if (flags & ~PERF_FLAG_ALL)
11468                 return -EINVAL;
11469
11470         /* Do we allow access to perf_event_open(2) ? */
11471         err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
11472         if (err)
11473                 return err;
11474
11475         err = perf_copy_attr(attr_uptr, &attr);
11476         if (err)
11477                 return err;
11478
11479         if (!attr.exclude_kernel) {
11480                 err = perf_allow_kernel(&attr);
11481                 if (err)
11482                         return err;
11483         }
11484
11485         if (attr.namespaces) {
11486                 if (!capable(CAP_SYS_ADMIN))
11487                         return -EACCES;
11488         }
11489
11490         if (attr.freq) {
11491                 if (attr.sample_freq > sysctl_perf_event_sample_rate)
11492                         return -EINVAL;
11493         } else {
11494                 if (attr.sample_period & (1ULL << 63))
11495                         return -EINVAL;
11496         }
11497
11498         /* Only privileged users can get physical addresses */
11499         if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
11500                 err = perf_allow_kernel(&attr);
11501                 if (err)
11502                         return err;
11503         }
11504
11505         err = security_locked_down(LOCKDOWN_PERF);
11506         if (err && (attr.sample_type & PERF_SAMPLE_REGS_INTR))
11507                 /* REGS_INTR can leak data, lockdown must prevent this */
11508                 return err;
11509
11510         err = 0;
11511
11512         /*
11513          * In cgroup mode, the pid argument is used to pass the fd
11514          * opened to the cgroup directory in cgroupfs. The cpu argument
11515          * designates the cpu on which to monitor threads from that
11516          * cgroup.
11517          */
11518         if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
11519                 return -EINVAL;
11520
11521         if (flags & PERF_FLAG_FD_CLOEXEC)
11522                 f_flags |= O_CLOEXEC;
11523
11524         event_fd = get_unused_fd_flags(f_flags);
11525         if (event_fd < 0)
11526                 return event_fd;
11527
11528         if (group_fd != -1) {
11529                 err = perf_fget_light(group_fd, &group);
11530                 if (err)
11531                         goto err_fd;
11532                 group_leader = group.file->private_data;
11533                 if (flags & PERF_FLAG_FD_OUTPUT)
11534                         output_event = group_leader;
11535                 if (flags & PERF_FLAG_FD_NO_GROUP)
11536                         group_leader = NULL;
11537         }
11538
11539         if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
11540                 task = find_lively_task_by_vpid(pid);
11541                 if (IS_ERR(task)) {
11542                         err = PTR_ERR(task);
11543                         goto err_group_fd;
11544                 }
11545         }
11546
11547         if (task && group_leader &&
11548             group_leader->attr.inherit != attr.inherit) {
11549                 err = -EINVAL;
11550                 goto err_task;
11551         }
11552
11553         if (task) {
11554                 err = mutex_lock_interruptible(&task->signal->exec_update_mutex);
11555                 if (err)
11556                         goto err_task;
11557
11558                 /*
11559                  * Reuse ptrace permission checks for now.
11560                  *
11561                  * We must hold exec_update_mutex across this and any potential
11562                  * perf_install_in_context() call for this new event to
11563                  * serialize against exec() altering our credentials (and the
11564                  * perf_event_exit_task() that could imply).
11565                  */
11566                 err = -EACCES;
11567                 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
11568                         goto err_cred;
11569         }
11570
11571         if (flags & PERF_FLAG_PID_CGROUP)
11572                 cgroup_fd = pid;
11573
11574         event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
11575                                  NULL, NULL, cgroup_fd);
11576         if (IS_ERR(event)) {
11577                 err = PTR_ERR(event);
11578                 goto err_cred;
11579         }
11580
11581         if (is_sampling_event(event)) {
11582                 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
11583                         err = -EOPNOTSUPP;
11584                         goto err_alloc;
11585                 }
11586         }
11587
11588         /*
11589          * Special case software events and allow them to be part of
11590          * any hardware group.
11591          */
11592         pmu = event->pmu;
11593
11594         if (attr.use_clockid) {
11595                 err = perf_event_set_clock(event, attr.clockid);
11596                 if (err)
11597                         goto err_alloc;
11598         }
11599
11600         if (pmu->task_ctx_nr == perf_sw_context)
11601                 event->event_caps |= PERF_EV_CAP_SOFTWARE;
11602
11603         if (group_leader) {
11604                 if (is_software_event(event) &&
11605                     !in_software_context(group_leader)) {
11606                         /*
11607                          * If the event is a sw event, but the group_leader
11608                          * is on hw context.
11609                          *
11610                          * Allow the addition of software events to hw
11611                          * groups, this is safe because software events
11612                          * never fail to schedule.
11613                          */
11614                         pmu = group_leader->ctx->pmu;
11615                 } else if (!is_software_event(event) &&
11616                            is_software_event(group_leader) &&
11617                            (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
11618                         /*
11619                          * In case the group is a pure software group, and we
11620                          * try to add a hardware event, move the whole group to
11621                          * the hardware context.
11622                          */
11623                         move_group = 1;
11624                 }
11625         }
11626
11627         /*
11628          * Get the target context (task or percpu):
11629          */
11630         ctx = find_get_context(pmu, task, event);
11631         if (IS_ERR(ctx)) {
11632                 err = PTR_ERR(ctx);
11633                 goto err_alloc;
11634         }
11635
11636         /*
11637          * Look up the group leader (we will attach this event to it):
11638          */
11639         if (group_leader) {
11640                 err = -EINVAL;
11641
11642                 /*
11643                  * Do not allow a recursive hierarchy (this new sibling
11644                  * becoming part of another group-sibling):
11645                  */
11646                 if (group_leader->group_leader != group_leader)
11647                         goto err_context;
11648
11649                 /* All events in a group should have the same clock */
11650                 if (group_leader->clock != event->clock)
11651                         goto err_context;
11652
11653                 /*
11654                  * Make sure we're both events for the same CPU;
11655                  * grouping events for different CPUs is broken; since
11656                  * you can never concurrently schedule them anyhow.
11657                  */
11658                 if (group_leader->cpu != event->cpu)
11659                         goto err_context;
11660
11661                 /*
11662                  * Make sure we're both on the same task, or both
11663                  * per-CPU events.
11664                  */
11665                 if (group_leader->ctx->task != ctx->task)
11666                         goto err_context;
11667
11668                 /*
11669                  * Do not allow to attach to a group in a different task
11670                  * or CPU context. If we're moving SW events, we'll fix
11671                  * this up later, so allow that.
11672                  */
11673                 if (!move_group && group_leader->ctx != ctx)
11674                         goto err_context;
11675
11676                 /*
11677                  * Only a group leader can be exclusive or pinned
11678                  */
11679                 if (attr.exclusive || attr.pinned)
11680                         goto err_context;
11681         }
11682
11683         if (output_event) {
11684                 err = perf_event_set_output(event, output_event);
11685                 if (err)
11686                         goto err_context;
11687         }
11688
11689         event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
11690                                         f_flags);
11691         if (IS_ERR(event_file)) {
11692                 err = PTR_ERR(event_file);
11693                 event_file = NULL;
11694                 goto err_context;
11695         }
11696
11697         if (move_group) {
11698                 gctx = __perf_event_ctx_lock_double(group_leader, ctx);
11699
11700                 if (gctx->task == TASK_TOMBSTONE) {
11701                         err = -ESRCH;
11702                         goto err_locked;
11703                 }
11704
11705                 /*
11706                  * Check if we raced against another sys_perf_event_open() call
11707                  * moving the software group underneath us.
11708                  */
11709                 if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
11710                         /*
11711                          * If someone moved the group out from under us, check
11712                          * if this new event wound up on the same ctx, if so
11713                          * its the regular !move_group case, otherwise fail.
11714                          */
11715                         if (gctx != ctx) {
11716                                 err = -EINVAL;
11717                                 goto err_locked;
11718                         } else {
11719                                 perf_event_ctx_unlock(group_leader, gctx);
11720                                 move_group = 0;
11721                         }
11722                 }
11723
11724                 /*
11725                  * Failure to create exclusive events returns -EBUSY.
11726                  */
11727                 err = -EBUSY;
11728                 if (!exclusive_event_installable(group_leader, ctx))
11729                         goto err_locked;
11730
11731                 for_each_sibling_event(sibling, group_leader) {
11732                         if (!exclusive_event_installable(sibling, ctx))
11733                                 goto err_locked;
11734                 }
11735         } else {
11736                 mutex_lock(&ctx->mutex);
11737         }
11738
11739         if (ctx->task == TASK_TOMBSTONE) {
11740                 err = -ESRCH;
11741                 goto err_locked;
11742         }
11743
11744         if (!perf_event_validate_size(event)) {
11745                 err = -E2BIG;
11746                 goto err_locked;
11747         }
11748
11749         if (!task) {
11750                 /*
11751                  * Check if the @cpu we're creating an event for is online.
11752                  *
11753                  * We use the perf_cpu_context::ctx::mutex to serialize against
11754                  * the hotplug notifiers. See perf_event_{init,exit}_cpu().
11755                  */
11756                 struct perf_cpu_context *cpuctx =
11757                         container_of(ctx, struct perf_cpu_context, ctx);
11758
11759                 if (!cpuctx->online) {
11760                         err = -ENODEV;
11761                         goto err_locked;
11762                 }
11763         }
11764
11765         if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
11766                 err = -EINVAL;
11767                 goto err_locked;
11768         }
11769
11770         /*
11771          * Must be under the same ctx::mutex as perf_install_in_context(),
11772          * because we need to serialize with concurrent event creation.
11773          */
11774         if (!exclusive_event_installable(event, ctx)) {
11775                 err = -EBUSY;
11776                 goto err_locked;
11777         }
11778
11779         WARN_ON_ONCE(ctx->parent_ctx);
11780
11781         /*
11782          * This is the point on no return; we cannot fail hereafter. This is
11783          * where we start modifying current state.
11784          */
11785
11786         if (move_group) {
11787                 /*
11788                  * See perf_event_ctx_lock() for comments on the details
11789                  * of swizzling perf_event::ctx.
11790                  */
11791                 perf_remove_from_context(group_leader, 0);
11792                 put_ctx(gctx);
11793
11794                 for_each_sibling_event(sibling, group_leader) {
11795                         perf_remove_from_context(sibling, 0);
11796                         put_ctx(gctx);
11797                 }
11798
11799                 /*
11800                  * Wait for everybody to stop referencing the events through
11801                  * the old lists, before installing it on new lists.
11802                  */
11803                 synchronize_rcu();
11804
11805                 /*
11806                  * Install the group siblings before the group leader.
11807                  *
11808                  * Because a group leader will try and install the entire group
11809                  * (through the sibling list, which is still in-tact), we can
11810                  * end up with siblings installed in the wrong context.
11811                  *
11812                  * By installing siblings first we NO-OP because they're not
11813                  * reachable through the group lists.
11814                  */
11815                 for_each_sibling_event(sibling, group_leader) {
11816                         perf_event__state_init(sibling);
11817                         perf_install_in_context(ctx, sibling, sibling->cpu);
11818                         get_ctx(ctx);
11819                 }
11820
11821                 /*
11822                  * Removing from the context ends up with disabled
11823                  * event. What we want here is event in the initial
11824                  * startup state, ready to be add into new context.
11825                  */
11826                 perf_event__state_init(group_leader);
11827                 perf_install_in_context(ctx, group_leader, group_leader->cpu);
11828                 get_ctx(ctx);
11829         }
11830
11831         /*
11832          * Precalculate sample_data sizes; do while holding ctx::mutex such
11833          * that we're serialized against further additions and before
11834          * perf_install_in_context() which is the point the event is active and
11835          * can use these values.
11836          */
11837         perf_event__header_size(event);
11838         perf_event__id_header_size(event);
11839
11840         event->owner = current;
11841
11842         perf_install_in_context(ctx, event, event->cpu);
11843         perf_unpin_context(ctx);
11844
11845         if (move_group)
11846                 perf_event_ctx_unlock(group_leader, gctx);
11847         mutex_unlock(&ctx->mutex);
11848
11849         if (task) {
11850                 mutex_unlock(&task->signal->exec_update_mutex);
11851                 put_task_struct(task);
11852         }
11853
11854         mutex_lock(&current->perf_event_mutex);
11855         list_add_tail(&event->owner_entry, &current->perf_event_list);
11856         mutex_unlock(&current->perf_event_mutex);
11857
11858         /*
11859          * Drop the reference on the group_event after placing the
11860          * new event on the sibling_list. This ensures destruction
11861          * of the group leader will find the pointer to itself in
11862          * perf_group_detach().
11863          */
11864         fdput(group);
11865         fd_install(event_fd, event_file);
11866         return event_fd;
11867
11868 err_locked:
11869         if (move_group)
11870                 perf_event_ctx_unlock(group_leader, gctx);
11871         mutex_unlock(&ctx->mutex);
11872 /* err_file: */
11873         fput(event_file);
11874 err_context:
11875         perf_unpin_context(ctx);
11876         put_ctx(ctx);
11877 err_alloc:
11878         /*
11879          * If event_file is set, the fput() above will have called ->release()
11880          * and that will take care of freeing the event.
11881          */
11882         if (!event_file)
11883                 free_event(event);
11884 err_cred:
11885         if (task)
11886                 mutex_unlock(&task->signal->exec_update_mutex);
11887 err_task:
11888         if (task)
11889                 put_task_struct(task);
11890 err_group_fd:
11891         fdput(group);
11892 err_fd:
11893         put_unused_fd(event_fd);
11894         return err;
11895 }
11896
11897 /**
11898  * perf_event_create_kernel_counter
11899  *
11900  * @attr: attributes of the counter to create
11901  * @cpu: cpu in which the counter is bound
11902  * @task: task to profile (NULL for percpu)
11903  */
11904 struct perf_event *
11905 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
11906                                  struct task_struct *task,
11907                                  perf_overflow_handler_t overflow_handler,
11908                                  void *context)
11909 {
11910         struct perf_event_context *ctx;
11911         struct perf_event *event;
11912         int err;
11913
11914         /*
11915          * Grouping is not supported for kernel events, neither is 'AUX',
11916          * make sure the caller's intentions are adjusted.
11917          */
11918         if (attr->aux_output)
11919                 return ERR_PTR(-EINVAL);
11920
11921         event = perf_event_alloc(attr, cpu, task, NULL, NULL,
11922                                  overflow_handler, context, -1);
11923         if (IS_ERR(event)) {
11924                 err = PTR_ERR(event);
11925                 goto err;
11926         }
11927
11928         /* Mark owner so we could distinguish it from user events. */
11929         event->owner = TASK_TOMBSTONE;
11930
11931         /*
11932          * Get the target context (task or percpu):
11933          */
11934         ctx = find_get_context(event->pmu, task, event);
11935         if (IS_ERR(ctx)) {
11936                 err = PTR_ERR(ctx);
11937                 goto err_free;
11938         }
11939
11940         WARN_ON_ONCE(ctx->parent_ctx);
11941         mutex_lock(&ctx->mutex);
11942         if (ctx->task == TASK_TOMBSTONE) {
11943                 err = -ESRCH;
11944                 goto err_unlock;
11945         }
11946
11947         if (!task) {
11948                 /*
11949                  * Check if the @cpu we're creating an event for is online.
11950                  *
11951                  * We use the perf_cpu_context::ctx::mutex to serialize against
11952                  * the hotplug notifiers. See perf_event_{init,exit}_cpu().
11953                  */
11954                 struct perf_cpu_context *cpuctx =
11955                         container_of(ctx, struct perf_cpu_context, ctx);
11956                 if (!cpuctx->online) {
11957                         err = -ENODEV;
11958                         goto err_unlock;
11959                 }
11960         }
11961
11962         if (!exclusive_event_installable(event, ctx)) {
11963                 err = -EBUSY;
11964                 goto err_unlock;
11965         }
11966
11967         perf_install_in_context(ctx, event, event->cpu);
11968         perf_unpin_context(ctx);
11969         mutex_unlock(&ctx->mutex);
11970
11971         return event;
11972
11973 err_unlock:
11974         mutex_unlock(&ctx->mutex);
11975         perf_unpin_context(ctx);
11976         put_ctx(ctx);
11977 err_free:
11978         free_event(event);
11979 err:
11980         return ERR_PTR(err);
11981 }
11982 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
11983
11984 void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
11985 {
11986         struct perf_event_context *src_ctx;
11987         struct perf_event_context *dst_ctx;
11988         struct perf_event *event, *tmp;
11989         LIST_HEAD(events);
11990
11991         src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
11992         dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
11993
11994         /*
11995          * See perf_event_ctx_lock() for comments on the details
11996          * of swizzling perf_event::ctx.
11997          */
11998         mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
11999         list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
12000                                  event_entry) {
12001                 perf_remove_from_context(event, 0);
12002                 unaccount_event_cpu(event, src_cpu);
12003                 put_ctx(src_ctx);
12004                 list_add(&event->migrate_entry, &events);
12005         }
12006
12007         /*
12008          * Wait for the events to quiesce before re-instating them.
12009          */
12010         synchronize_rcu();
12011
12012         /*
12013          * Re-instate events in 2 passes.
12014          *
12015          * Skip over group leaders and only install siblings on this first
12016          * pass, siblings will not get enabled without a leader, however a
12017          * leader will enable its siblings, even if those are still on the old
12018          * context.
12019          */
12020         list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
12021                 if (event->group_leader == event)
12022                         continue;
12023
12024                 list_del(&event->migrate_entry);
12025                 if (event->state >= PERF_EVENT_STATE_OFF)
12026                         event->state = PERF_EVENT_STATE_INACTIVE;
12027                 account_event_cpu(event, dst_cpu);
12028                 perf_install_in_context(dst_ctx, event, dst_cpu);
12029                 get_ctx(dst_ctx);
12030         }
12031
12032         /*
12033          * Once all the siblings are setup properly, install the group leaders
12034          * to make it go.
12035          */
12036         list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
12037                 list_del(&event->migrate_entry);
12038                 if (event->state >= PERF_EVENT_STATE_OFF)
12039                         event->state = PERF_EVENT_STATE_INACTIVE;
12040                 account_event_cpu(event, dst_cpu);
12041                 perf_install_in_context(dst_ctx, event, dst_cpu);
12042                 get_ctx(dst_ctx);
12043         }
12044         mutex_unlock(&dst_ctx->mutex);
12045         mutex_unlock(&src_ctx->mutex);
12046 }
12047 EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
12048
12049 static void sync_child_event(struct perf_event *child_event,
12050                                struct task_struct *child)
12051 {
12052         struct perf_event *parent_event = child_event->parent;
12053         u64 child_val;
12054
12055         if (child_event->attr.inherit_stat)
12056                 perf_event_read_event(child_event, child);
12057
12058         child_val = perf_event_count(child_event);
12059
12060         /*
12061          * Add back the child's count to the parent's count:
12062          */
12063         atomic64_add(child_val, &parent_event->child_count);
12064         atomic64_add(child_event->total_time_enabled,
12065                      &parent_event->child_total_time_enabled);
12066         atomic64_add(child_event->total_time_running,
12067                      &parent_event->child_total_time_running);
12068 }
12069
12070 static void
12071 perf_event_exit_event(struct perf_event *child_event,
12072                       struct perf_event_context *child_ctx,
12073                       struct task_struct *child)
12074 {
12075         struct perf_event *parent_event = child_event->parent;
12076
12077         /*
12078          * Do not destroy the 'original' grouping; because of the context
12079          * switch optimization the original events could've ended up in a
12080          * random child task.
12081          *
12082          * If we were to destroy the original group, all group related
12083          * operations would cease to function properly after this random
12084          * child dies.
12085          *
12086          * Do destroy all inherited groups, we don't care about those
12087          * and being thorough is better.
12088          */
12089         raw_spin_lock_irq(&child_ctx->lock);
12090         WARN_ON_ONCE(child_ctx->is_active);
12091
12092         if (parent_event)
12093                 perf_group_detach(child_event);
12094         list_del_event(child_event, child_ctx);
12095         perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */
12096         raw_spin_unlock_irq(&child_ctx->lock);
12097
12098         /*
12099          * Parent events are governed by their filedesc, retain them.
12100          */
12101         if (!parent_event) {
12102                 perf_event_wakeup(child_event);
12103                 return;
12104         }
12105         /*
12106          * Child events can be cleaned up.
12107          */
12108
12109         sync_child_event(child_event, child);
12110
12111         /*
12112          * Remove this event from the parent's list
12113          */
12114         WARN_ON_ONCE(parent_event->ctx->parent_ctx);
12115         mutex_lock(&parent_event->child_mutex);
12116         list_del_init(&child_event->child_list);
12117         mutex_unlock(&parent_event->child_mutex);
12118
12119         /*
12120          * Kick perf_poll() for is_event_hup().
12121          */
12122         perf_event_wakeup(parent_event);
12123         free_event(child_event);
12124         put_event(parent_event);
12125 }
12126
12127 static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
12128 {
12129         struct perf_event_context *child_ctx, *clone_ctx = NULL;
12130         struct perf_event *child_event, *next;
12131
12132         WARN_ON_ONCE(child != current);
12133
12134         child_ctx = perf_pin_task_context(child, ctxn);
12135         if (!child_ctx)
12136                 return;
12137
12138         /*
12139          * In order to reduce the amount of tricky in ctx tear-down, we hold
12140          * ctx::mutex over the entire thing. This serializes against almost
12141          * everything that wants to access the ctx.
12142          *
12143          * The exception is sys_perf_event_open() /
12144          * perf_event_create_kernel_count() which does find_get_context()
12145          * without ctx::mutex (it cannot because of the move_group double mutex
12146          * lock thing). See the comments in perf_install_in_context().
12147          */
12148         mutex_lock(&child_ctx->mutex);
12149
12150         /*
12151          * In a single ctx::lock section, de-schedule the events and detach the
12152          * context from the task such that we cannot ever get it scheduled back
12153          * in.
12154          */
12155         raw_spin_lock_irq(&child_ctx->lock);
12156         task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
12157
12158         /*
12159          * Now that the context is inactive, destroy the task <-> ctx relation
12160          * and mark the context dead.
12161          */
12162         RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
12163         put_ctx(child_ctx); /* cannot be last */
12164         WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
12165         put_task_struct(current); /* cannot be last */
12166
12167         clone_ctx = unclone_ctx(child_ctx);
12168         raw_spin_unlock_irq(&child_ctx->lock);
12169
12170         if (clone_ctx)
12171                 put_ctx(clone_ctx);
12172
12173         /*
12174          * Report the task dead after unscheduling the events so that we
12175          * won't get any samples after PERF_RECORD_EXIT. We can however still
12176          * get a few PERF_RECORD_READ events.
12177          */
12178         perf_event_task(child, child_ctx, 0);
12179
12180         list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
12181                 perf_event_exit_event(child_event, child_ctx, child);
12182
12183         mutex_unlock(&child_ctx->mutex);
12184
12185         put_ctx(child_ctx);
12186 }
12187
12188 /*
12189  * When a child task exits, feed back event values to parent events.
12190  *
12191  * Can be called with exec_update_mutex held when called from
12192  * install_exec_creds().
12193  */
12194 void perf_event_exit_task(struct task_struct *child)
12195 {
12196         struct perf_event *event, *tmp;
12197         int ctxn;
12198
12199         mutex_lock(&child->perf_event_mutex);
12200         list_for_each_entry_safe(event, tmp, &child->perf_event_list,
12201                                  owner_entry) {
12202                 list_del_init(&event->owner_entry);
12203
12204                 /*
12205                  * Ensure the list deletion is visible before we clear
12206                  * the owner, closes a race against perf_release() where
12207                  * we need to serialize on the owner->perf_event_mutex.
12208                  */
12209                 smp_store_release(&event->owner, NULL);
12210         }
12211         mutex_unlock(&child->perf_event_mutex);
12212
12213         for_each_task_context_nr(ctxn)
12214                 perf_event_exit_task_context(child, ctxn);
12215
12216         /*
12217          * The perf_event_exit_task_context calls perf_event_task
12218          * with child's task_ctx, which generates EXIT events for
12219          * child contexts and sets child->perf_event_ctxp[] to NULL.
12220          * At this point we need to send EXIT events to cpu contexts.
12221          */
12222         perf_event_task(child, NULL, 0);
12223 }
12224
12225 static void perf_free_event(struct perf_event *event,
12226                             struct perf_event_context *ctx)
12227 {
12228         struct perf_event *parent = event->parent;
12229
12230         if (WARN_ON_ONCE(!parent))
12231                 return;
12232
12233         mutex_lock(&parent->child_mutex);
12234         list_del_init(&event->child_list);
12235         mutex_unlock(&parent->child_mutex);
12236
12237         put_event(parent);
12238
12239         raw_spin_lock_irq(&ctx->lock);
12240         perf_group_detach(event);
12241         list_del_event(event, ctx);
12242         raw_spin_unlock_irq(&ctx->lock);
12243         free_event(event);
12244 }
12245
12246 /*
12247  * Free a context as created by inheritance by perf_event_init_task() below,
12248  * used by fork() in case of fail.
12249  *
12250  * Even though the task has never lived, the context and events have been
12251  * exposed through the child_list, so we must take care tearing it all down.
12252  */
12253 void perf_event_free_task(struct task_struct *task)
12254 {
12255         struct perf_event_context *ctx;
12256         struct perf_event *event, *tmp;
12257         int ctxn;
12258
12259         for_each_task_context_nr(ctxn) {
12260                 ctx = task->perf_event_ctxp[ctxn];
12261                 if (!ctx)
12262                         continue;
12263
12264                 mutex_lock(&ctx->mutex);
12265                 raw_spin_lock_irq(&ctx->lock);
12266                 /*
12267                  * Destroy the task <-> ctx relation and mark the context dead.
12268                  *
12269                  * This is important because even though the task hasn't been
12270                  * exposed yet the context has been (through child_list).
12271                  */
12272                 RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
12273                 WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
12274                 put_task_struct(task); /* cannot be last */
12275                 raw_spin_unlock_irq(&ctx->lock);
12276
12277                 list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
12278                         perf_free_event(event, ctx);
12279
12280                 mutex_unlock(&ctx->mutex);
12281
12282                 /*
12283                  * perf_event_release_kernel() could've stolen some of our
12284                  * child events and still have them on its free_list. In that
12285                  * case we must wait for these events to have been freed (in
12286                  * particular all their references to this task must've been
12287                  * dropped).
12288                  *
12289                  * Without this copy_process() will unconditionally free this
12290                  * task (irrespective of its reference count) and
12291                  * _free_event()'s put_task_struct(event->hw.target) will be a
12292                  * use-after-free.
12293                  *
12294                  * Wait for all events to drop their context reference.
12295                  */
12296                 wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
12297                 put_ctx(ctx); /* must be last */
12298         }
12299 }
12300
12301 void perf_event_delayed_put(struct task_struct *task)
12302 {
12303         int ctxn;
12304
12305         for_each_task_context_nr(ctxn)
12306                 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
12307 }
12308
12309 struct file *perf_event_get(unsigned int fd)
12310 {
12311         struct file *file = fget(fd);
12312         if (!file)
12313                 return ERR_PTR(-EBADF);
12314
12315         if (file->f_op != &perf_fops) {
12316                 fput(file);
12317                 return ERR_PTR(-EBADF);
12318         }
12319
12320         return file;
12321 }
12322
12323 const struct perf_event *perf_get_event(struct file *file)
12324 {
12325         if (file->f_op != &perf_fops)
12326                 return ERR_PTR(-EINVAL);
12327
12328         return file->private_data;
12329 }
12330
12331 const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
12332 {
12333         if (!event)
12334                 return ERR_PTR(-EINVAL);
12335
12336         return &event->attr;
12337 }
12338
12339 /*
12340  * Inherit an event from parent task to child task.
12341  *
12342  * Returns:
12343  *  - valid pointer on success
12344  *  - NULL for orphaned events
12345  *  - IS_ERR() on error
12346  */
12347 static struct perf_event *
12348 inherit_event(struct perf_event *parent_event,
12349               struct task_struct *parent,
12350               struct perf_event_context *parent_ctx,
12351               struct task_struct *child,
12352               struct perf_event *group_leader,
12353               struct perf_event_context *child_ctx)
12354 {
12355         enum perf_event_state parent_state = parent_event->state;
12356         struct perf_event *child_event;
12357         unsigned long flags;
12358
12359         /*
12360          * Instead of creating recursive hierarchies of events,
12361          * we link inherited events back to the original parent,
12362          * which has a filp for sure, which we use as the reference
12363          * count:
12364          */
12365         if (parent_event->parent)
12366                 parent_event = parent_event->parent;
12367
12368         child_event = perf_event_alloc(&parent_event->attr,
12369                                            parent_event->cpu,
12370                                            child,
12371                                            group_leader, parent_event,
12372                                            NULL, NULL, -1);
12373         if (IS_ERR(child_event))
12374                 return child_event;
12375
12376
12377         if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
12378             !child_ctx->task_ctx_data) {
12379                 struct pmu *pmu = child_event->pmu;
12380
12381                 child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size,
12382                                                    GFP_KERNEL);
12383                 if (!child_ctx->task_ctx_data) {
12384                         free_event(child_event);
12385                         return ERR_PTR(-ENOMEM);
12386                 }
12387         }
12388
12389         /*
12390          * is_orphaned_event() and list_add_tail(&parent_event->child_list)
12391          * must be under the same lock in order to serialize against
12392          * perf_event_release_kernel(), such that either we must observe
12393          * is_orphaned_event() or they will observe us on the child_list.
12394          */
12395         mutex_lock(&parent_event->child_mutex);
12396         if (is_orphaned_event(parent_event) ||
12397             !atomic_long_inc_not_zero(&parent_event->refcount)) {
12398                 mutex_unlock(&parent_event->child_mutex);
12399                 /* task_ctx_data is freed with child_ctx */
12400                 free_event(child_event);
12401                 return NULL;
12402         }
12403
12404         get_ctx(child_ctx);
12405
12406         /*
12407          * Make the child state follow the state of the parent event,
12408          * not its attr.disabled bit.  We hold the parent's mutex,
12409          * so we won't race with perf_event_{en, dis}able_family.
12410          */
12411         if (parent_state >= PERF_EVENT_STATE_INACTIVE)
12412                 child_event->state = PERF_EVENT_STATE_INACTIVE;
12413         else
12414                 child_event->state = PERF_EVENT_STATE_OFF;
12415
12416         if (parent_event->attr.freq) {
12417                 u64 sample_period = parent_event->hw.sample_period;
12418                 struct hw_perf_event *hwc = &child_event->hw;
12419
12420                 hwc->sample_period = sample_period;
12421                 hwc->last_period   = sample_period;
12422
12423                 local64_set(&hwc->period_left, sample_period);
12424         }
12425
12426         child_event->ctx = child_ctx;
12427         child_event->overflow_handler = parent_event->overflow_handler;
12428         child_event->overflow_handler_context
12429                 = parent_event->overflow_handler_context;
12430
12431         /*
12432          * Precalculate sample_data sizes
12433          */
12434         perf_event__header_size(child_event);
12435         perf_event__id_header_size(child_event);
12436
12437         /*
12438          * Link it up in the child's context:
12439          */
12440         raw_spin_lock_irqsave(&child_ctx->lock, flags);
12441         add_event_to_ctx(child_event, child_ctx);
12442         raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
12443
12444         /*
12445          * Link this into the parent event's child list
12446          */
12447         list_add_tail(&child_event->child_list, &parent_event->child_list);
12448         mutex_unlock(&parent_event->child_mutex);
12449
12450         return child_event;
12451 }
12452
12453 /*
12454  * Inherits an event group.
12455  *
12456  * This will quietly suppress orphaned events; !inherit_event() is not an error.
12457  * This matches with perf_event_release_kernel() removing all child events.
12458  *
12459  * Returns:
12460  *  - 0 on success
12461  *  - <0 on error
12462  */
12463 static int inherit_group(struct perf_event *parent_event,
12464               struct task_struct *parent,
12465               struct perf_event_context *parent_ctx,
12466               struct task_struct *child,
12467               struct perf_event_context *child_ctx)
12468 {
12469         struct perf_event *leader;
12470         struct perf_event *sub;
12471         struct perf_event *child_ctr;
12472
12473         leader = inherit_event(parent_event, parent, parent_ctx,
12474                                  child, NULL, child_ctx);
12475         if (IS_ERR(leader))
12476                 return PTR_ERR(leader);
12477         /*
12478          * @leader can be NULL here because of is_orphaned_event(). In this
12479          * case inherit_event() will create individual events, similar to what
12480          * perf_group_detach() would do anyway.
12481          */
12482         for_each_sibling_event(sub, parent_event) {
12483                 child_ctr = inherit_event(sub, parent, parent_ctx,
12484                                             child, leader, child_ctx);
12485                 if (IS_ERR(child_ctr))
12486                         return PTR_ERR(child_ctr);
12487
12488                 if (sub->aux_event == parent_event && child_ctr &&
12489                     !perf_get_aux_event(child_ctr, leader))
12490                         return -EINVAL;
12491         }
12492         return 0;
12493 }
12494
12495 /*
12496  * Creates the child task context and tries to inherit the event-group.
12497  *
12498  * Clears @inherited_all on !attr.inherited or error. Note that we'll leave
12499  * inherited_all set when we 'fail' to inherit an orphaned event; this is
12500  * consistent with perf_event_release_kernel() removing all child events.
12501  *
12502  * Returns:
12503  *  - 0 on success
12504  *  - <0 on error
12505  */
12506 static int
12507 inherit_task_group(struct perf_event *event, struct task_struct *parent,
12508                    struct perf_event_context *parent_ctx,
12509                    struct task_struct *child, int ctxn,
12510                    int *inherited_all)
12511 {
12512         int ret;
12513         struct perf_event_context *child_ctx;
12514
12515         if (!event->attr.inherit) {
12516                 *inherited_all = 0;
12517                 return 0;
12518         }
12519
12520         child_ctx = child->perf_event_ctxp[ctxn];
12521         if (!child_ctx) {
12522                 /*
12523                  * This is executed from the parent task context, so
12524                  * inherit events that have been marked for cloning.
12525                  * First allocate and initialize a context for the
12526                  * child.
12527                  */
12528                 child_ctx = alloc_perf_context(parent_ctx->pmu, child);
12529                 if (!child_ctx)
12530                         return -ENOMEM;
12531
12532                 child->perf_event_ctxp[ctxn] = child_ctx;
12533         }
12534
12535         ret = inherit_group(event, parent, parent_ctx,
12536                             child, child_ctx);
12537
12538         if (ret)
12539                 *inherited_all = 0;
12540
12541         return ret;
12542 }
12543
12544 /*
12545  * Initialize the perf_event context in task_struct
12546  */
12547 static int perf_event_init_context(struct task_struct *child, int ctxn)
12548 {
12549         struct perf_event_context *child_ctx, *parent_ctx;
12550         struct perf_event_context *cloned_ctx;
12551         struct perf_event *event;
12552         struct task_struct *parent = current;
12553         int inherited_all = 1;
12554         unsigned long flags;
12555         int ret = 0;
12556
12557         if (likely(!parent->perf_event_ctxp[ctxn]))
12558                 return 0;
12559
12560         /*
12561          * If the parent's context is a clone, pin it so it won't get
12562          * swapped under us.
12563          */
12564         parent_ctx = perf_pin_task_context(parent, ctxn);
12565         if (!parent_ctx)
12566                 return 0;
12567
12568         /*
12569          * No need to check if parent_ctx != NULL here; since we saw
12570          * it non-NULL earlier, the only reason for it to become NULL
12571          * is if we exit, and since we're currently in the middle of
12572          * a fork we can't be exiting at the same time.
12573          */
12574
12575         /*
12576          * Lock the parent list. No need to lock the child - not PID
12577          * hashed yet and not running, so nobody can access it.
12578          */
12579         mutex_lock(&parent_ctx->mutex);
12580
12581         /*
12582          * We dont have to disable NMIs - we are only looking at
12583          * the list, not manipulating it:
12584          */
12585         perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
12586                 ret = inherit_task_group(event, parent, parent_ctx,
12587                                          child, ctxn, &inherited_all);
12588                 if (ret)
12589                         goto out_unlock;
12590         }
12591
12592         /*
12593          * We can't hold ctx->lock when iterating the ->flexible_group list due
12594          * to allocations, but we need to prevent rotation because
12595          * rotate_ctx() will change the list from interrupt context.
12596          */
12597         raw_spin_lock_irqsave(&parent_ctx->lock, flags);
12598         parent_ctx->rotate_disable = 1;
12599         raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
12600
12601         perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
12602                 ret = inherit_task_group(event, parent, parent_ctx,
12603                                          child, ctxn, &inherited_all);
12604                 if (ret)
12605                         goto out_unlock;
12606         }
12607
12608         raw_spin_lock_irqsave(&parent_ctx->lock, flags);
12609         parent_ctx->rotate_disable = 0;
12610
12611         child_ctx = child->perf_event_ctxp[ctxn];
12612
12613         if (child_ctx && inherited_all) {
12614                 /*
12615                  * Mark the child context as a clone of the parent
12616                  * context, or of whatever the parent is a clone of.
12617                  *
12618                  * Note that if the parent is a clone, the holding of
12619                  * parent_ctx->lock avoids it from being uncloned.
12620                  */
12621                 cloned_ctx = parent_ctx->parent_ctx;
12622                 if (cloned_ctx) {
12623                         child_ctx->parent_ctx = cloned_ctx;
12624                         child_ctx->parent_gen = parent_ctx->parent_gen;
12625                 } else {
12626                         child_ctx->parent_ctx = parent_ctx;
12627                         child_ctx->parent_gen = parent_ctx->generation;
12628                 }
12629                 get_ctx(child_ctx->parent_ctx);
12630         }
12631
12632         raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
12633 out_unlock:
12634         mutex_unlock(&parent_ctx->mutex);
12635
12636         perf_unpin_context(parent_ctx);
12637         put_ctx(parent_ctx);
12638
12639         return ret;
12640 }
12641
12642 /*
12643  * Initialize the perf_event context in task_struct
12644  */
12645 int perf_event_init_task(struct task_struct *child)
12646 {
12647         int ctxn, ret;
12648
12649         memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
12650         mutex_init(&child->perf_event_mutex);
12651         INIT_LIST_HEAD(&child->perf_event_list);
12652
12653         for_each_task_context_nr(ctxn) {
12654                 ret = perf_event_init_context(child, ctxn);
12655                 if (ret) {
12656                         perf_event_free_task(child);
12657                         return ret;
12658                 }
12659         }
12660
12661         return 0;
12662 }
12663
12664 static void __init perf_event_init_all_cpus(void)
12665 {
12666         struct swevent_htable *swhash;
12667         int cpu;
12668
12669         zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
12670
12671         for_each_possible_cpu(cpu) {
12672                 swhash = &per_cpu(swevent_htable, cpu);
12673                 mutex_init(&swhash->hlist_mutex);
12674                 INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
12675
12676                 INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
12677                 raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
12678
12679 #ifdef CONFIG_CGROUP_PERF
12680                 INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
12681 #endif
12682                 INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
12683         }
12684 }
12685
12686 static void perf_swevent_init_cpu(unsigned int cpu)
12687 {
12688         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
12689
12690         mutex_lock(&swhash->hlist_mutex);
12691         if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
12692                 struct swevent_hlist *hlist;
12693
12694                 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
12695                 WARN_ON(!hlist);
12696                 rcu_assign_pointer(swhash->swevent_hlist, hlist);
12697         }
12698         mutex_unlock(&swhash->hlist_mutex);
12699 }
12700
12701 #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
12702 static void __perf_event_exit_context(void *__info)
12703 {
12704         struct perf_event_context *ctx = __info;
12705         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
12706         struct perf_event *event;
12707
12708         raw_spin_lock(&ctx->lock);
12709         ctx_sched_out(ctx, cpuctx, EVENT_TIME);
12710         list_for_each_entry(event, &ctx->event_list, event_entry)
12711                 __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
12712         raw_spin_unlock(&ctx->lock);
12713 }
12714
12715 static void perf_event_exit_cpu_context(int cpu)
12716 {
12717         struct perf_cpu_context *cpuctx;
12718         struct perf_event_context *ctx;
12719         struct pmu *pmu;
12720
12721         mutex_lock(&pmus_lock);
12722         list_for_each_entry(pmu, &pmus, entry) {
12723                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
12724                 ctx = &cpuctx->ctx;
12725
12726                 mutex_lock(&ctx->mutex);
12727                 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
12728                 cpuctx->online = 0;
12729                 mutex_unlock(&ctx->mutex);
12730         }
12731         cpumask_clear_cpu(cpu, perf_online_mask);
12732         mutex_unlock(&pmus_lock);
12733 }
12734 #else
12735
12736 static void perf_event_exit_cpu_context(int cpu) { }
12737
12738 #endif
12739
12740 int perf_event_init_cpu(unsigned int cpu)
12741 {
12742         struct perf_cpu_context *cpuctx;
12743         struct perf_event_context *ctx;
12744         struct pmu *pmu;
12745
12746         perf_swevent_init_cpu(cpu);
12747
12748         mutex_lock(&pmus_lock);
12749         cpumask_set_cpu(cpu, perf_online_mask);
12750         list_for_each_entry(pmu, &pmus, entry) {
12751                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
12752                 ctx = &cpuctx->ctx;
12753
12754                 mutex_lock(&ctx->mutex);
12755                 cpuctx->online = 1;
12756                 mutex_unlock(&ctx->mutex);
12757         }
12758         mutex_unlock(&pmus_lock);
12759
12760         return 0;
12761 }
12762
12763 int perf_event_exit_cpu(unsigned int cpu)
12764 {
12765         perf_event_exit_cpu_context(cpu);
12766         return 0;
12767 }
12768
12769 static int
12770 perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
12771 {
12772         int cpu;
12773
12774         for_each_online_cpu(cpu)
12775                 perf_event_exit_cpu(cpu);
12776
12777         return NOTIFY_OK;
12778 }
12779
12780 /*
12781  * Run the perf reboot notifier at the very last possible moment so that
12782  * the generic watchdog code runs as long as possible.
12783  */
12784 static struct notifier_block perf_reboot_notifier = {
12785         .notifier_call = perf_reboot,
12786         .priority = INT_MIN,
12787 };
12788
12789 void __init perf_event_init(void)
12790 {
12791         int ret;
12792
12793         idr_init(&pmu_idr);
12794
12795         perf_event_init_all_cpus();
12796         init_srcu_struct(&pmus_srcu);
12797         perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
12798         perf_pmu_register(&perf_cpu_clock, NULL, -1);
12799         perf_pmu_register(&perf_task_clock, NULL, -1);
12800         perf_tp_register();
12801         perf_event_init_cpu(smp_processor_id());
12802         register_reboot_notifier(&perf_reboot_notifier);
12803
12804         ret = init_hw_breakpoint();
12805         WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
12806
12807         /*
12808          * Build time assertion that we keep the data_head at the intended
12809          * location.  IOW, validation we got the __reserved[] size right.
12810          */
12811         BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
12812                      != 1024);
12813 }
12814
12815 ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
12816                               char *page)
12817 {
12818         struct perf_pmu_events_attr *pmu_attr =
12819                 container_of(attr, struct perf_pmu_events_attr, attr);
12820
12821         if (pmu_attr->event_str)
12822                 return sprintf(page, "%s\n", pmu_attr->event_str);
12823
12824         return 0;
12825 }
12826 EXPORT_SYMBOL_GPL(perf_event_sysfs_show);
12827
12828 static int __init perf_event_sysfs_init(void)
12829 {
12830         struct pmu *pmu;
12831         int ret;
12832
12833         mutex_lock(&pmus_lock);
12834
12835         ret = bus_register(&pmu_bus);
12836         if (ret)
12837                 goto unlock;
12838
12839         list_for_each_entry(pmu, &pmus, entry) {
12840                 if (!pmu->name || pmu->type < 0)
12841                         continue;
12842
12843                 ret = pmu_dev_alloc(pmu);
12844                 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
12845         }
12846         pmu_bus_running = 1;
12847         ret = 0;
12848
12849 unlock:
12850         mutex_unlock(&pmus_lock);
12851
12852         return ret;
12853 }
12854 device_initcall(perf_event_sysfs_init);
12855
12856 #ifdef CONFIG_CGROUP_PERF
12857 static struct cgroup_subsys_state *
12858 perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
12859 {
12860         struct perf_cgroup *jc;
12861
12862         jc = kzalloc(sizeof(*jc), GFP_KERNEL);
12863         if (!jc)
12864                 return ERR_PTR(-ENOMEM);
12865
12866         jc->info = alloc_percpu(struct perf_cgroup_info);
12867         if (!jc->info) {
12868                 kfree(jc);
12869                 return ERR_PTR(-ENOMEM);
12870         }
12871
12872         return &jc->css;
12873 }
12874
12875 static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
12876 {
12877         struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
12878
12879         free_percpu(jc->info);
12880         kfree(jc);
12881 }
12882
12883 static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
12884 {
12885         perf_event_cgroup(css->cgroup);
12886         return 0;
12887 }
12888
12889 static int __perf_cgroup_move(void *info)
12890 {
12891         struct task_struct *task = info;
12892         rcu_read_lock();
12893         perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
12894         rcu_read_unlock();
12895         return 0;
12896 }
12897
12898 static void perf_cgroup_attach(struct cgroup_taskset *tset)
12899 {
12900         struct task_struct *task;
12901         struct cgroup_subsys_state *css;
12902
12903         cgroup_taskset_for_each(task, css, tset)
12904                 task_function_call(task, __perf_cgroup_move, task);
12905 }
12906
12907 struct cgroup_subsys perf_event_cgrp_subsys = {
12908         .css_alloc      = perf_cgroup_css_alloc,
12909         .css_free       = perf_cgroup_css_free,
12910         .css_online     = perf_cgroup_css_online,
12911         .attach         = perf_cgroup_attach,
12912         /*
12913          * Implicitly enable on dfl hierarchy so that perf events can
12914          * always be filtered by cgroup2 path as long as perf_event
12915          * controller is not mounted on a legacy hierarchy.
12916          */
12917         .implicit_on_dfl = true,
12918         .threaded       = true,
12919 };
12920 #endif /* CONFIG_CGROUP_PERF */