perf: Rework perf_event_exit_event()
[linux-2.6-microblaze.git] / kernel / events / core.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Performance events core code:
4  *
5  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
6  *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
7  *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
8  *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
9  */
10
11 #include <linux/fs.h>
12 #include <linux/mm.h>
13 #include <linux/cpu.h>
14 #include <linux/smp.h>
15 #include <linux/idr.h>
16 #include <linux/file.h>
17 #include <linux/poll.h>
18 #include <linux/slab.h>
19 #include <linux/hash.h>
20 #include <linux/tick.h>
21 #include <linux/sysfs.h>
22 #include <linux/dcache.h>
23 #include <linux/percpu.h>
24 #include <linux/ptrace.h>
25 #include <linux/reboot.h>
26 #include <linux/vmstat.h>
27 #include <linux/device.h>
28 #include <linux/export.h>
29 #include <linux/vmalloc.h>
30 #include <linux/hardirq.h>
31 #include <linux/hugetlb.h>
32 #include <linux/rculist.h>
33 #include <linux/uaccess.h>
34 #include <linux/syscalls.h>
35 #include <linux/anon_inodes.h>
36 #include <linux/kernel_stat.h>
37 #include <linux/cgroup.h>
38 #include <linux/perf_event.h>
39 #include <linux/trace_events.h>
40 #include <linux/hw_breakpoint.h>
41 #include <linux/mm_types.h>
42 #include <linux/module.h>
43 #include <linux/mman.h>
44 #include <linux/compat.h>
45 #include <linux/bpf.h>
46 #include <linux/filter.h>
47 #include <linux/namei.h>
48 #include <linux/parser.h>
49 #include <linux/sched/clock.h>
50 #include <linux/sched/mm.h>
51 #include <linux/proc_ns.h>
52 #include <linux/mount.h>
53 #include <linux/min_heap.h>
54 #include <linux/highmem.h>
55 #include <linux/pgtable.h>
56 #include <linux/buildid.h>
57
58 #include "internal.h"
59
60 #include <asm/irq_regs.h>
61
62 typedef int (*remote_function_f)(void *);
63
64 struct remote_function_call {
65         struct task_struct      *p;
66         remote_function_f       func;
67         void                    *info;
68         int                     ret;
69 };
70
71 static void remote_function(void *data)
72 {
73         struct remote_function_call *tfc = data;
74         struct task_struct *p = tfc->p;
75
76         if (p) {
77                 /* -EAGAIN */
78                 if (task_cpu(p) != smp_processor_id())
79                         return;
80
81                 /*
82                  * Now that we're on the right CPU with IRQs disabled, we can test
83                  * if we hit the right task without races.
84                  */
85
86                 tfc->ret = -ESRCH; /* No such (running) process */
87                 if (p != current)
88                         return;
89         }
90
91         tfc->ret = tfc->func(tfc->info);
92 }
93
94 /**
95  * task_function_call - call a function on the cpu on which a task runs
96  * @p:          the task to evaluate
97  * @func:       the function to be called
98  * @info:       the function call argument
99  *
100  * Calls the function @func when the task is currently running. This might
101  * be on the current CPU, which just calls the function directly.  This will
102  * retry due to any failures in smp_call_function_single(), such as if the
103  * task_cpu() goes offline concurrently.
104  *
105  * returns @func's return value, or -ESRCH/-ENXIO when the process isn't running
106  */
107 static int
108 task_function_call(struct task_struct *p, remote_function_f func, void *info)
109 {
110         struct remote_function_call data = {
111                 .p      = p,
112                 .func   = func,
113                 .info   = info,
114                 .ret    = -EAGAIN,
115         };
116         int ret;
117
118         for (;;) {
119                 ret = smp_call_function_single(task_cpu(p), remote_function,
120                                                &data, 1);
121                 if (!ret)
122                         ret = data.ret;
123
124                 if (ret != -EAGAIN)
125                         break;
126
127                 cond_resched();
128         }
129
130         return ret;
131 }
132
133 /**
134  * cpu_function_call - call a function on the cpu
135  * @func:       the function to be called
136  * @info:       the function call argument
137  *
138  * Calls the function @func on the remote cpu.
139  *
140  * returns: @func return value or -ENXIO when the cpu is offline
141  */
142 static int cpu_function_call(int cpu, remote_function_f func, void *info)
143 {
144         struct remote_function_call data = {
145                 .p      = NULL,
146                 .func   = func,
147                 .info   = info,
148                 .ret    = -ENXIO, /* No such CPU */
149         };
150
151         smp_call_function_single(cpu, remote_function, &data, 1);
152
153         return data.ret;
154 }
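/*
 * Editorial illustration (not part of the original source): a sketch of how
 * the two helpers above are typically used.  The callback runs with IRQs
 * disabled on the target CPU and returns 0 or a -errno; for the task variant,
 * a leftover -EAGAIN means the task migrated and task_function_call() retries
 * internally.  The callback name below is hypothetical:
 *
 *	static int __touch_event(void *info)
 *	{
 *		struct perf_event *event = info;
 *		... act on @event from the CPU it runs (or would run) on ...
 *		return 0;
 *	}
 *
 *	err = task_function_call(task, __touch_event, event);
 *	err = cpu_function_call(event->cpu, __touch_event, event);
 */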
155
156 static inline struct perf_cpu_context *
157 __get_cpu_context(struct perf_event_context *ctx)
158 {
159         return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
160 }
161
162 static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
163                           struct perf_event_context *ctx)
164 {
165         raw_spin_lock(&cpuctx->ctx.lock);
166         if (ctx)
167                 raw_spin_lock(&ctx->lock);
168 }
169
170 static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
171                             struct perf_event_context *ctx)
172 {
173         if (ctx)
174                 raw_spin_unlock(&ctx->lock);
175         raw_spin_unlock(&cpuctx->ctx.lock);
176 }
177
178 #define TASK_TOMBSTONE ((void *)-1L)
179
180 static bool is_kernel_event(struct perf_event *event)
181 {
182         return READ_ONCE(event->owner) == TASK_TOMBSTONE;
183 }
184
185 /*
186  * On task ctx scheduling...
187  *
188  * When !ctx->nr_events a task context will not be scheduled. This means
189  * we can disable the scheduler hooks (for performance) without leaving
190  * pending task ctx state.
191  *
192  * This however results in two special cases:
193  *
194  *  - removing the last event from a task ctx; this is relatively
195  *    straightforward and is done in __perf_remove_from_context().
196  *
197  *  - adding the first event to a task ctx; this is tricky because we cannot
198  *    rely on ctx->is_active and therefore cannot use event_function_call().
199  *    See perf_install_in_context().
200  *
201  * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
202  */
203
204 typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
205                         struct perf_event_context *, void *);
206
207 struct event_function_struct {
208         struct perf_event *event;
209         event_f func;
210         void *data;
211 };
212
213 static int event_function(void *info)
214 {
215         struct event_function_struct *efs = info;
216         struct perf_event *event = efs->event;
217         struct perf_event_context *ctx = event->ctx;
218         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
219         struct perf_event_context *task_ctx = cpuctx->task_ctx;
220         int ret = 0;
221
222         lockdep_assert_irqs_disabled();
223
224         perf_ctx_lock(cpuctx, task_ctx);
225         /*
226          * Since we do the IPI call without holding ctx->lock, things can have
227          * changed; double check that we hit the task we set out to hit.
228          */
229         if (ctx->task) {
230                 if (ctx->task != current) {
231                         ret = -ESRCH;
232                         goto unlock;
233                 }
234
235                 /*
236                  * We only use event_function_call() on established contexts,
237                  * and event_function() is only ever called when active (or
238                  * rather, we'll have bailed in task_function_call() or the
239                  * above ctx->task != current test), therefore we must have
240                  * ctx->is_active here.
241                  */
242                 WARN_ON_ONCE(!ctx->is_active);
243                 /*
244                  * And since we have ctx->is_active, cpuctx->task_ctx must
245                  * match.
246                  */
247                 WARN_ON_ONCE(task_ctx != ctx);
248         } else {
249                 WARN_ON_ONCE(&cpuctx->ctx != ctx);
250         }
251
252         efs->func(event, cpuctx, ctx, efs->data);
253 unlock:
254         perf_ctx_unlock(cpuctx, task_ctx);
255
256         return ret;
257 }
258
259 static void event_function_call(struct perf_event *event, event_f func, void *data)
260 {
261         struct perf_event_context *ctx = event->ctx;
262         struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
263         struct event_function_struct efs = {
264                 .event = event,
265                 .func = func,
266                 .data = data,
267         };
268
269         if (!event->parent) {
270                 /*
271                  * If this is a !child event, we must hold ctx::mutex to
272                  * stabilize the event->ctx relation. See
273                  * perf_event_ctx_lock().
274                  */
275                 lockdep_assert_held(&ctx->mutex);
276         }
277
278         if (!task) {
279                 cpu_function_call(event->cpu, event_function, &efs);
280                 return;
281         }
282
283         if (task == TASK_TOMBSTONE)
284                 return;
285
286 again:
287         if (!task_function_call(task, event_function, &efs))
288                 return;
289
290         raw_spin_lock_irq(&ctx->lock);
291         /*
292          * Reload the task pointer; it might have been changed by
293          * a concurrent perf_event_context_sched_out().
294          */
295         task = ctx->task;
296         if (task == TASK_TOMBSTONE) {
297                 raw_spin_unlock_irq(&ctx->lock);
298                 return;
299         }
300         if (ctx->is_active) {
301                 raw_spin_unlock_irq(&ctx->lock);
302                 goto again;
303         }
304         func(event, NULL, ctx, data);
305         raw_spin_unlock_irq(&ctx->lock);
306 }
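/*
 * Editorial note (illustration, not original source): the usual pattern in
 * the rest of this file is a small modifier function with the event_f
 * signature handed to event_function_call(), which takes the IPI path while
 * the context is active and the locked fallback path otherwise:
 *
 *	static void __perf_event_frob(struct perf_event *event,
 *				      struct perf_cpu_context *cpuctx,
 *				      struct perf_event_context *ctx,
 *				      void *info)
 *	{
 *		... ctx->lock is held, event state is stable here ...
 *	}
 *
 *	event_function_call(event, __perf_event_frob, NULL);
 *
 * The name __perf_event_frob is made up; see the real callers further down.
 */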
307
308 /*
309  * Similar to event_function_call() + event_function(), but hard assumes IRQs
310  * are already disabled and we're on the right CPU.
311  */
312 static void event_function_local(struct perf_event *event, event_f func, void *data)
313 {
314         struct perf_event_context *ctx = event->ctx;
315         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
316         struct task_struct *task = READ_ONCE(ctx->task);
317         struct perf_event_context *task_ctx = NULL;
318
319         lockdep_assert_irqs_disabled();
320
321         if (task) {
322                 if (task == TASK_TOMBSTONE)
323                         return;
324
325                 task_ctx = ctx;
326         }
327
328         perf_ctx_lock(cpuctx, task_ctx);
329
330         task = ctx->task;
331         if (task == TASK_TOMBSTONE)
332                 goto unlock;
333
334         if (task) {
335                 /*
336                  * We must be either inactive or active and the right task,
337                  * otherwise we're screwed, since we cannot IPI to somewhere
338                  * else.
339                  */
340                 if (ctx->is_active) {
341                         if (WARN_ON_ONCE(task != current))
342                                 goto unlock;
343
344                         if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
345                                 goto unlock;
346                 }
347         } else {
348                 WARN_ON_ONCE(&cpuctx->ctx != ctx);
349         }
350
351         func(event, cpuctx, ctx, data);
352 unlock:
353         perf_ctx_unlock(cpuctx, task_ctx);
354 }
355
356 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
357                        PERF_FLAG_FD_OUTPUT  |\
358                        PERF_FLAG_PID_CGROUP |\
359                        PERF_FLAG_FD_CLOEXEC)
360
361 /*
362  * branch priv levels that need permission checks
363  */
364 #define PERF_SAMPLE_BRANCH_PERM_PLM \
365         (PERF_SAMPLE_BRANCH_KERNEL |\
366          PERF_SAMPLE_BRANCH_HV)
367
368 enum event_type_t {
369         EVENT_FLEXIBLE = 0x1,
370         EVENT_PINNED = 0x2,
371         EVENT_TIME = 0x4,
372         /* see ctx_resched() for details */
373         EVENT_CPU = 0x8,
374         EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
375 };
376
377 /*
378  * perf_sched_events : >0 events exist
379  * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
380  */
381
382 static void perf_sched_delayed(struct work_struct *work);
383 DEFINE_STATIC_KEY_FALSE(perf_sched_events);
384 static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
385 static DEFINE_MUTEX(perf_sched_mutex);
386 static atomic_t perf_sched_count;
387
388 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
389 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
390 static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
391
392 static atomic_t nr_mmap_events __read_mostly;
393 static atomic_t nr_comm_events __read_mostly;
394 static atomic_t nr_namespaces_events __read_mostly;
395 static atomic_t nr_task_events __read_mostly;
396 static atomic_t nr_freq_events __read_mostly;
397 static atomic_t nr_switch_events __read_mostly;
398 static atomic_t nr_ksymbol_events __read_mostly;
399 static atomic_t nr_bpf_events __read_mostly;
400 static atomic_t nr_cgroup_events __read_mostly;
401 static atomic_t nr_text_poke_events __read_mostly;
402 static atomic_t nr_build_id_events __read_mostly;
403
404 static LIST_HEAD(pmus);
405 static DEFINE_MUTEX(pmus_lock);
406 static struct srcu_struct pmus_srcu;
407 static cpumask_var_t perf_online_mask;
408 static struct kmem_cache *perf_event_cache;
409
410 /*
411  * perf event paranoia level:
412  *  -1 - not paranoid at all
413  *   0 - disallow raw tracepoint access for unpriv
414  *   1 - disallow cpu events for unpriv
415  *   2 - disallow kernel profiling for unpriv
416  */
417 int sysctl_perf_event_paranoid __read_mostly = 2;
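/*
 * Editorial note (not original source): this is the backing variable of the
 * kernel.perf_event_paranoid sysctl, e.g. from userspace:
 *
 *	# sysctl kernel.perf_event_paranoid=2
 *	# echo -1 > /proc/sys/kernel/perf_event_paranoid
 */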
418
419 /* Minimum for 512 kiB + 1 user control page */
420 int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
421
422 /*
423  * max perf event sample rate
424  */
425 #define DEFAULT_MAX_SAMPLE_RATE         100000
426 #define DEFAULT_SAMPLE_PERIOD_NS        (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
427 #define DEFAULT_CPU_TIME_MAX_PERCENT    25
428
429 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
430
431 static int max_samples_per_tick __read_mostly   = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
432 static int perf_sample_period_ns __read_mostly  = DEFAULT_SAMPLE_PERIOD_NS;
433
434 static int perf_sample_allowed_ns __read_mostly =
435         DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
436
437 static void update_perf_cpu_limits(void)
438 {
439         u64 tmp = perf_sample_period_ns;
440
441         tmp *= sysctl_perf_cpu_time_max_percent;
442         tmp = div_u64(tmp, 100);
443         if (!tmp)
444                 tmp = 1;
445
446         WRITE_ONCE(perf_sample_allowed_ns, tmp);
447 }
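/*
 * Editorial worked example (not original source): with the defaults above,
 * perf_sample_period_ns = NSEC_PER_SEC / 100000 = 10000 ns and
 * sysctl_perf_cpu_time_max_percent = 25, so update_perf_cpu_limits() gives
 *
 *	perf_sample_allowed_ns = 10000 * 25 / 100 = 2500 ns
 *
 * i.e. the average sample is allowed to consume at most 2.5us of CPU time
 * before perf_sample_event_took() starts lowering the sample rate.
 */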
448
449 static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
450
451 int perf_proc_update_handler(struct ctl_table *table, int write,
452                 void *buffer, size_t *lenp, loff_t *ppos)
453 {
454         int ret;
455         int perf_cpu = sysctl_perf_cpu_time_max_percent;
456         /*
457          * If throttling is disabled don't allow the write:
458          */
459         if (write && (perf_cpu == 100 || perf_cpu == 0))
460                 return -EINVAL;
461
462         ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
463         if (ret || !write)
464                 return ret;
465
466         max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
467         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
468         update_perf_cpu_limits();
469
470         return 0;
471 }
472
473 int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
474
475 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
476                 void *buffer, size_t *lenp, loff_t *ppos)
477 {
478         int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
479
480         if (ret || !write)
481                 return ret;
482
483         if (sysctl_perf_cpu_time_max_percent == 100 ||
484             sysctl_perf_cpu_time_max_percent == 0) {
485                 printk(KERN_WARNING
486                        "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
487                 WRITE_ONCE(perf_sample_allowed_ns, 0);
488         } else {
489                 update_perf_cpu_limits();
490         }
491
492         return 0;
493 }
494
495 /*
496  * perf samples are done in some very critical code paths (NMIs).
497  * If they take too much CPU time, the system can lock up and not
498  * get any real work done.  This will drop the sample rate when
499  * we detect that events are taking too long.
500  */
501 #define NR_ACCUMULATED_SAMPLES 128
502 static DEFINE_PER_CPU(u64, running_sample_length);
503
504 static u64 __report_avg;
505 static u64 __report_allowed;
506
507 static void perf_duration_warn(struct irq_work *w)
508 {
509         printk_ratelimited(KERN_INFO
510                 "perf: interrupt took too long (%lld > %lld), lowering "
511                 "kernel.perf_event_max_sample_rate to %d\n",
512                 __report_avg, __report_allowed,
513                 sysctl_perf_event_sample_rate);
514 }
515
516 static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
517
518 void perf_sample_event_took(u64 sample_len_ns)
519 {
520         u64 max_len = READ_ONCE(perf_sample_allowed_ns);
521         u64 running_len;
522         u64 avg_len;
523         u32 max;
524
525         if (max_len == 0)
526                 return;
527
528         /* Decay the counter by 1 average sample. */
529         running_len = __this_cpu_read(running_sample_length);
530         running_len -= running_len/NR_ACCUMULATED_SAMPLES;
531         running_len += sample_len_ns;
532         __this_cpu_write(running_sample_length, running_len);
533
534         /*
535          * Note: this will be biased artificially low until we have
536          * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
537          * from having to maintain a count.
538          */
539         avg_len = running_len/NR_ACCUMULATED_SAMPLES;
540         if (avg_len <= max_len)
541                 return;
542
543         __report_avg = avg_len;
544         __report_allowed = max_len;
545
546         /*
547          * Allow 25% headroom above the current average sample duration.
548          */
549         avg_len += avg_len / 4;
550         max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
551         if (avg_len < max)
552                 max /= (u32)avg_len;
553         else
554                 max = 1;
555
556         WRITE_ONCE(perf_sample_allowed_ns, avg_len);
557         WRITE_ONCE(max_samples_per_tick, max);
558
559         sysctl_perf_event_sample_rate = max * HZ;
560         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
561
562         if (!irq_work_queue(&perf_duration_work)) {
563                 early_printk("perf: interrupt took too long (%lld > %lld), lowering "
564                              "kernel.perf_event_max_sample_rate to %d\n",
565                              __report_avg, __report_allowed,
566                              sysctl_perf_event_sample_rate);
567         }
568 }
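/*
 * Editorial worked example (not original source), assuming HZ=250 (so
 * TICK_NSEC = 4,000,000 ns), sysctl_perf_cpu_time_max_percent = 25, and a
 * measured average sample length of 8000 ns, above the 2500 ns allowed:
 *
 *	avg_len = 8000 + 8000/4        = 10000 ns  (25% headroom)
 *	max     = (4000000/100) * 25   = 1000000   (per-tick CPU budget, ns)
 *	max    /= avg_len              = 100       (samples per tick)
 *	sysctl_perf_event_sample_rate  = 100 * HZ  = 25000 Hz
 *
 * so perf_sample_allowed_ns is raised to 10000 ns and the max sample rate
 * drops from 100000 Hz to 25000 Hz.
 */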
569
570 static atomic64_t perf_event_id;
571
572 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
573                               enum event_type_t event_type);
574
575 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
576                              enum event_type_t event_type,
577                              struct task_struct *task);
578
579 static void update_context_time(struct perf_event_context *ctx);
580 static u64 perf_event_time(struct perf_event *event);
581
582 void __weak perf_event_print_debug(void)        { }
583
584 extern __weak const char *perf_pmu_name(void)
585 {
586         return "pmu";
587 }
588
589 static inline u64 perf_clock(void)
590 {
591         return local_clock();
592 }
593
594 static inline u64 perf_event_clock(struct perf_event *event)
595 {
596         return event->clock();
597 }
598
599 /*
600  * State based event timekeeping...
601  *
602  * The basic idea is to use event->state to determine which (if any) time
603  * fields to increment with the current delta. This means we only need to
604  * update timestamps when we change state or when they are explicitly requested
605  * (read).
606  *
607  * Event groups make things a little more complicated, but not terribly so. The
608  * rules for a group are that if the group leader is OFF the entire group is
609  * OFF, irrespective of what the group member states are. This results in
610  * __perf_effective_state().
611  *
612  * A further ramification is that when a group leader flips between OFF and
613  * !OFF, we need to update all group member times.
614  *
615  *
616  * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
617  * need to make sure the relevant context time is updated before we try and
618  * update our timestamps.
619  */
620
621 static __always_inline enum perf_event_state
622 __perf_effective_state(struct perf_event *event)
623 {
624         struct perf_event *leader = event->group_leader;
625
626         if (leader->state <= PERF_EVENT_STATE_OFF)
627                 return leader->state;
628
629         return event->state;
630 }
631
632 static __always_inline void
633 __perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
634 {
635         enum perf_event_state state = __perf_effective_state(event);
636         u64 delta = now - event->tstamp;
637
638         *enabled = event->total_time_enabled;
639         if (state >= PERF_EVENT_STATE_INACTIVE)
640                 *enabled += delta;
641
642         *running = event->total_time_running;
643         if (state >= PERF_EVENT_STATE_ACTIVE)
644                 *running += delta;
645 }
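/*
 * Editorial illustration (not original source): suppose an event was
 * INACTIVE for 3 ms and then ACTIVE for 5 ms.  At each state change
 * perf_event_set_state() calls perf_event_update_time(), so by read time:
 *
 *	total_time_enabled = 3 ms + 5 ms   (accrues while state >= INACTIVE)
 *	total_time_running = 5 ms          (accrues only while ACTIVE)
 *
 * __perf_update_times() itself only folds in the delta since event->tstamp
 * for the event's *current* (effective) state.
 */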
646
647 static void perf_event_update_time(struct perf_event *event)
648 {
649         u64 now = perf_event_time(event);
650
651         __perf_update_times(event, now, &event->total_time_enabled,
652                                         &event->total_time_running);
653         event->tstamp = now;
654 }
655
656 static void perf_event_update_sibling_time(struct perf_event *leader)
657 {
658         struct perf_event *sibling;
659
660         for_each_sibling_event(sibling, leader)
661                 perf_event_update_time(sibling);
662 }
663
664 static void
665 perf_event_set_state(struct perf_event *event, enum perf_event_state state)
666 {
667         if (event->state == state)
668                 return;
669
670         perf_event_update_time(event);
671         /*
672          * If a group leader gets enabled/disabled all its siblings
673          * are affected too.
674          */
675         if ((event->state < 0) ^ (state < 0))
676                 perf_event_update_sibling_time(event);
677
678         WRITE_ONCE(event->state, state);
679 }
680
681 #ifdef CONFIG_CGROUP_PERF
682
683 static inline bool
684 perf_cgroup_match(struct perf_event *event)
685 {
686         struct perf_event_context *ctx = event->ctx;
687         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
688
689         /* @event doesn't care about cgroup */
690         if (!event->cgrp)
691                 return true;
692
693         /* wants specific cgroup scope but @cpuctx isn't associated with any */
694         if (!cpuctx->cgrp)
695                 return false;
696
697         /*
698          * Cgroup scoping is recursive.  An event enabled for a cgroup is
699          * also enabled for all its descendant cgroups.  If @cpuctx's
700          * cgroup is a descendant of @event's (the test covers identity
701          * case), it's a match.
702          */
703         return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
704                                     event->cgrp->css.cgroup);
705 }
706
707 static inline void perf_detach_cgroup(struct perf_event *event)
708 {
709         css_put(&event->cgrp->css);
710         event->cgrp = NULL;
711 }
712
713 static inline int is_cgroup_event(struct perf_event *event)
714 {
715         return event->cgrp != NULL;
716 }
717
718 static inline u64 perf_cgroup_event_time(struct perf_event *event)
719 {
720         struct perf_cgroup_info *t;
721
722         t = per_cpu_ptr(event->cgrp->info, event->cpu);
723         return t->time;
724 }
725
726 static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
727 {
728         struct perf_cgroup_info *info;
729         u64 now;
730
731         now = perf_clock();
732
733         info = this_cpu_ptr(cgrp->info);
734
735         info->time += now - info->timestamp;
736         info->timestamp = now;
737 }
738
739 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
740 {
741         struct perf_cgroup *cgrp = cpuctx->cgrp;
742         struct cgroup_subsys_state *css;
743
744         if (cgrp) {
745                 for (css = &cgrp->css; css; css = css->parent) {
746                         cgrp = container_of(css, struct perf_cgroup, css);
747                         __update_cgrp_time(cgrp);
748                 }
749         }
750 }
751
752 static inline void update_cgrp_time_from_event(struct perf_event *event)
753 {
754         struct perf_cgroup *cgrp;
755
756         /*
757          * ensure we access cgroup data only when needed and
758          * when we know the cgroup is pinned (css_get)
759          */
760         if (!is_cgroup_event(event))
761                 return;
762
763         cgrp = perf_cgroup_from_task(current, event->ctx);
764         /*
765          * Do not update time when cgroup is not active
766          */
767         if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
768                 __update_cgrp_time(event->cgrp);
769 }
770
771 static inline void
772 perf_cgroup_set_timestamp(struct task_struct *task,
773                           struct perf_event_context *ctx)
774 {
775         struct perf_cgroup *cgrp;
776         struct perf_cgroup_info *info;
777         struct cgroup_subsys_state *css;
778
779         /*
780          * ctx->lock held by caller
781          * ensure we do not access cgroup data
782          * unless we have the cgroup pinned (css_get)
783          */
784         if (!task || !ctx->nr_cgroups)
785                 return;
786
787         cgrp = perf_cgroup_from_task(task, ctx);
788
789         for (css = &cgrp->css; css; css = css->parent) {
790                 cgrp = container_of(css, struct perf_cgroup, css);
791                 info = this_cpu_ptr(cgrp->info);
792                 info->timestamp = ctx->timestamp;
793         }
794 }
795
796 static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
797
798 #define PERF_CGROUP_SWOUT       0x1 /* cgroup switch out every event */
799 #define PERF_CGROUP_SWIN        0x2 /* cgroup switch in events based on task */
800
801 /*
802  * reschedule events based on the cgroup constraint of task.
803  *
804  * mode SWOUT : schedule out everything
805  * mode SWIN : schedule in based on cgroup for next
806  */
807 static void perf_cgroup_switch(struct task_struct *task, int mode)
808 {
809         struct perf_cpu_context *cpuctx;
810         struct list_head *list;
811         unsigned long flags;
812
813         /*
814          * Disable interrupts and preemption to keep this CPU's
815          * cgrp_cpuctx_entry from changing under us.
816          */
817         local_irq_save(flags);
818
819         list = this_cpu_ptr(&cgrp_cpuctx_list);
820         list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
821                 WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
822
823                 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
824                 perf_pmu_disable(cpuctx->ctx.pmu);
825
826                 if (mode & PERF_CGROUP_SWOUT) {
827                         cpu_ctx_sched_out(cpuctx, EVENT_ALL);
828                         /*
829                          * must not be done before ctxswout due
830                          * to event_filter_match() in event_sched_out()
831                          */
832                         cpuctx->cgrp = NULL;
833                 }
834
835                 if (mode & PERF_CGROUP_SWIN) {
836                         WARN_ON_ONCE(cpuctx->cgrp);
837                         /*
838                          * Set cgrp before ctxsw in, so that
839                          * event_filter_match() does not have to pass
840                          * the task around.
841                          * We pass the cpuctx->ctx to perf_cgroup_from_task()
842                          * because cgroup events are only per-cpu.
843                          */
844                         cpuctx->cgrp = perf_cgroup_from_task(task,
845                                                              &cpuctx->ctx);
846                         cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
847                 }
848                 perf_pmu_enable(cpuctx->ctx.pmu);
849                 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
850         }
851
852         local_irq_restore(flags);
853 }
854
855 static inline void perf_cgroup_sched_out(struct task_struct *task,
856                                          struct task_struct *next)
857 {
858         struct perf_cgroup *cgrp1;
859         struct perf_cgroup *cgrp2 = NULL;
860
861         rcu_read_lock();
862         /*
863          * we come here when we know perf_cgroup_events > 0
864          * we do not need to pass the ctx here because we know
865          * we are holding the rcu lock
866          */
867         cgrp1 = perf_cgroup_from_task(task, NULL);
868         cgrp2 = perf_cgroup_from_task(next, NULL);
869
870         /*
871          * only schedule out current cgroup events if we know
872          * that we are switching to a different cgroup. Otherwise,
873          * do not touch the cgroup events.
874          */
875         if (cgrp1 != cgrp2)
876                 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
877
878         rcu_read_unlock();
879 }
880
881 static inline void perf_cgroup_sched_in(struct task_struct *prev,
882                                         struct task_struct *task)
883 {
884         struct perf_cgroup *cgrp1;
885         struct perf_cgroup *cgrp2 = NULL;
886
887         rcu_read_lock();
888         /*
889          * we come here when we know perf_cgroup_events > 0
890          * we do not need to pass the ctx here because we know
891          * we are holding the rcu lock
892          */
893         cgrp1 = perf_cgroup_from_task(task, NULL);
894         cgrp2 = perf_cgroup_from_task(prev, NULL);
895
896         /*
897          * only need to schedule in cgroup events if we are changing
898          * cgroup during ctxsw. Cgroup events were not scheduled
899          * out during ctxsw-out if that was not the case.
900          */
901         if (cgrp1 != cgrp2)
902                 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
903
904         rcu_read_unlock();
905 }
906
907 static int perf_cgroup_ensure_storage(struct perf_event *event,
908                                 struct cgroup_subsys_state *css)
909 {
910         struct perf_cpu_context *cpuctx;
911         struct perf_event **storage;
912         int cpu, heap_size, ret = 0;
913
914         /*
915          * Allow storage to have sufficient space for an iterator for each
916          * possibly nested cgroup plus an iterator for events with no cgroup.
917          */
918         for (heap_size = 1; css; css = css->parent)
919                 heap_size++;
920
921         for_each_possible_cpu(cpu) {
922                 cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu);
923                 if (heap_size <= cpuctx->heap_size)
924                         continue;
925
926                 storage = kmalloc_node(heap_size * sizeof(struct perf_event *),
927                                        GFP_KERNEL, cpu_to_node(cpu));
928                 if (!storage) {
929                         ret = -ENOMEM;
930                         break;
931                 }
932
933                 raw_spin_lock_irq(&cpuctx->ctx.lock);
934                 if (cpuctx->heap_size < heap_size) {
935                         swap(cpuctx->heap, storage);
936                         if (storage == cpuctx->heap_default)
937                                 storage = NULL;
938                         cpuctx->heap_size = heap_size;
939                 }
940                 raw_spin_unlock_irq(&cpuctx->ctx.lock);
941
942                 kfree(storage);
943         }
944
945         return ret;
946 }
947
948 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
949                                       struct perf_event_attr *attr,
950                                       struct perf_event *group_leader)
951 {
952         struct perf_cgroup *cgrp;
953         struct cgroup_subsys_state *css;
954         struct fd f = fdget(fd);
955         int ret = 0;
956
957         if (!f.file)
958                 return -EBADF;
959
960         css = css_tryget_online_from_dir(f.file->f_path.dentry,
961                                          &perf_event_cgrp_subsys);
962         if (IS_ERR(css)) {
963                 ret = PTR_ERR(css);
964                 goto out;
965         }
966
967         ret = perf_cgroup_ensure_storage(event, css);
968         if (ret)
969                 goto out;
970
971         cgrp = container_of(css, struct perf_cgroup, css);
972         event->cgrp = cgrp;
973
974         /*
975          * all events in a group must monitor
976          * the same cgroup because a task belongs
977          * to only one perf cgroup at a time
978          */
979         if (group_leader && group_leader->cgrp != cgrp) {
980                 perf_detach_cgroup(event);
981                 ret = -EINVAL;
982         }
983 out:
984         fdput(f);
985         return ret;
986 }
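/*
 * Editorial illustration (not original source): cgroup events reach
 * perf_cgroup_connect() when userspace passes an open cgroup directory fd as
 * the "pid" argument of perf_event_open() together with
 * PERF_FLAG_PID_CGROUP, roughly (given a populated struct perf_event_attr):
 *
 *	int cfd = open("/sys/fs/cgroup/perf_event/mygrp", O_RDONLY);
 *	int pfd = syscall(__NR_perf_event_open, &attr, cfd, cpu,
 *			  -1, PERF_FLAG_PID_CGROUP);
 *
 * The cgroup path is only an example; @cpu must name a real CPU because
 * cgroup events are per-cpu only.
 */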
987
988 static inline void
989 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
990 {
991         struct perf_cgroup_info *t;
992         t = per_cpu_ptr(event->cgrp->info, event->cpu);
993         event->shadow_ctx_time = now - t->timestamp;
994 }
995
996 static inline void
997 perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
998 {
999         struct perf_cpu_context *cpuctx;
1000
1001         if (!is_cgroup_event(event))
1002                 return;
1003
1004         /*
1005          * Because cgroup events are always per-cpu events,
1006          * @ctx == &cpuctx->ctx.
1007          */
1008         cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
1009
1010         /*
1011          * Since setting cpuctx->cgrp is conditional on the current @cgrp
1012          * matching the event's cgroup, we must do this for every new event,
1013          * because if the first would mismatch, the second would not try again
1014          * and we would leave cpuctx->cgrp unset.
1015          */
1016         if (ctx->is_active && !cpuctx->cgrp) {
1017                 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
1018
1019                 if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
1020                         cpuctx->cgrp = cgrp;
1021         }
1022
1023         if (ctx->nr_cgroups++)
1024                 return;
1025
1026         list_add(&cpuctx->cgrp_cpuctx_entry,
1027                         per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
1028 }
1029
1030 static inline void
1031 perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
1032 {
1033         struct perf_cpu_context *cpuctx;
1034
1035         if (!is_cgroup_event(event))
1036                 return;
1037
1038         /*
1039          * Because cgroup events are always per-cpu events,
1040          * @ctx == &cpuctx->ctx.
1041          */
1042         cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
1043
1044         if (--ctx->nr_cgroups)
1045                 return;
1046
1047         if (ctx->is_active && cpuctx->cgrp)
1048                 cpuctx->cgrp = NULL;
1049
1050         list_del(&cpuctx->cgrp_cpuctx_entry);
1051 }
1052
1053 #else /* !CONFIG_CGROUP_PERF */
1054
1055 static inline bool
1056 perf_cgroup_match(struct perf_event *event)
1057 {
1058         return true;
1059 }
1060
1061 static inline void perf_detach_cgroup(struct perf_event *event)
1062 {}
1063
1064 static inline int is_cgroup_event(struct perf_event *event)
1065 {
1066         return 0;
1067 }
1068
1069 static inline void update_cgrp_time_from_event(struct perf_event *event)
1070 {
1071 }
1072
1073 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
1074 {
1075 }
1076
1077 static inline void perf_cgroup_sched_out(struct task_struct *task,
1078                                          struct task_struct *next)
1079 {
1080 }
1081
1082 static inline void perf_cgroup_sched_in(struct task_struct *prev,
1083                                         struct task_struct *task)
1084 {
1085 }
1086
1087 static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
1088                                       struct perf_event_attr *attr,
1089                                       struct perf_event *group_leader)
1090 {
1091         return -EINVAL;
1092 }
1093
1094 static inline void
1095 perf_cgroup_set_timestamp(struct task_struct *task,
1096                           struct perf_event_context *ctx)
1097 {
1098 }
1099
1100 static inline void
1101 perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
1102 {
1103 }
1104
1105 static inline void
1106 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
1107 {
1108 }
1109
1110 static inline u64 perf_cgroup_event_time(struct perf_event *event)
1111 {
1112         return 0;
1113 }
1114
1115 static inline void
1116 perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
1117 {
1118 }
1119
1120 static inline void
1121 perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
1122 {
1123 }
1124 #endif
1125
1126 /*
1127  * set default to be dependent on timer tick just
1128  * like original code
1129  */
1130 #define PERF_CPU_HRTIMER (1000 / HZ)
1131 /*
1132  * function must be called with interrupts disabled
1133  */
1134 static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
1135 {
1136         struct perf_cpu_context *cpuctx;
1137         bool rotations;
1138
1139         lockdep_assert_irqs_disabled();
1140
1141         cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
1142         rotations = perf_rotate_context(cpuctx);
1143
1144         raw_spin_lock(&cpuctx->hrtimer_lock);
1145         if (rotations)
1146                 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
1147         else
1148                 cpuctx->hrtimer_active = 0;
1149         raw_spin_unlock(&cpuctx->hrtimer_lock);
1150
1151         return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
1152 }
1153
1154 static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
1155 {
1156         struct hrtimer *timer = &cpuctx->hrtimer;
1157         struct pmu *pmu = cpuctx->ctx.pmu;
1158         u64 interval;
1159
1160         /* no multiplexing needed for SW PMU */
1161         if (pmu->task_ctx_nr == perf_sw_context)
1162                 return;
1163
1164         /*
1165          * check the default is sane; if not set then force to the
1166          * default interval (1/tick)
1167          */
1168         interval = pmu->hrtimer_interval_ms;
1169         if (interval < 1)
1170                 interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
1171
1172         cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
1173
1174         raw_spin_lock_init(&cpuctx->hrtimer_lock);
1175         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
1176         timer->function = perf_mux_hrtimer_handler;
1177 }
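/*
 * Editorial note (not original source): pmu->hrtimer_interval_ms is writable
 * per PMU through sysfs, so the multiplexing period set up above can be
 * tuned at runtime, e.g.:
 *
 *	# cat /sys/bus/event_source/devices/cpu/perf_event_mux_interval_ms
 *	4
 *	# echo 16 > /sys/bus/event_source/devices/cpu/perf_event_mux_interval_ms
 *
 * (4 ms is the 1000/HZ default with HZ=250.)
 */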
1178
1179 static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
1180 {
1181         struct hrtimer *timer = &cpuctx->hrtimer;
1182         struct pmu *pmu = cpuctx->ctx.pmu;
1183         unsigned long flags;
1184
1185         /* not for SW PMU */
1186         if (pmu->task_ctx_nr == perf_sw_context)
1187                 return 0;
1188
1189         raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
1190         if (!cpuctx->hrtimer_active) {
1191                 cpuctx->hrtimer_active = 1;
1192                 hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
1193                 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
1194         }
1195         raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
1196
1197         return 0;
1198 }
1199
1200 void perf_pmu_disable(struct pmu *pmu)
1201 {
1202         int *count = this_cpu_ptr(pmu->pmu_disable_count);
1203         if (!(*count)++)
1204                 pmu->pmu_disable(pmu);
1205 }
1206
1207 void perf_pmu_enable(struct pmu *pmu)
1208 {
1209         int *count = this_cpu_ptr(pmu->pmu_disable_count);
1210         if (!--(*count))
1211                 pmu->pmu_enable(pmu);
1212 }
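/*
 * Editorial illustration (not original source): the per-cpu disable count
 * nests, so callers simply bracket PMU reprogramming with a balanced pair
 * and the hardware is only touched on the outermost transition:
 *
 *	perf_pmu_disable(pmu);
 *	... reprogram events while the PMU is stopped ...
 *	perf_pmu_enable(pmu);
 */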
1213
1214 static DEFINE_PER_CPU(struct list_head, active_ctx_list);
1215
1216 /*
1217  * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
1218  * perf_event_task_tick() are fully serialized because they're strictly cpu
1219  * affine and perf_event_ctx{activate,deactivate} are called with IRQs
1220  * disabled, while perf_event_task_tick is called from IRQ context.
1221  */
1222 static void perf_event_ctx_activate(struct perf_event_context *ctx)
1223 {
1224         struct list_head *head = this_cpu_ptr(&active_ctx_list);
1225
1226         lockdep_assert_irqs_disabled();
1227
1228         WARN_ON(!list_empty(&ctx->active_ctx_list));
1229
1230         list_add(&ctx->active_ctx_list, head);
1231 }
1232
1233 static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
1234 {
1235         lockdep_assert_irqs_disabled();
1236
1237         WARN_ON(list_empty(&ctx->active_ctx_list));
1238
1239         list_del_init(&ctx->active_ctx_list);
1240 }
1241
1242 static void get_ctx(struct perf_event_context *ctx)
1243 {
1244         refcount_inc(&ctx->refcount);
1245 }
1246
1247 static void *alloc_task_ctx_data(struct pmu *pmu)
1248 {
1249         if (pmu->task_ctx_cache)
1250                 return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
1251
1252         return NULL;
1253 }
1254
1255 static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
1256 {
1257         if (pmu->task_ctx_cache && task_ctx_data)
1258                 kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
1259 }
1260
1261 static void free_ctx(struct rcu_head *head)
1262 {
1263         struct perf_event_context *ctx;
1264
1265         ctx = container_of(head, struct perf_event_context, rcu_head);
1266         free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
1267         kfree(ctx);
1268 }
1269
1270 static void put_ctx(struct perf_event_context *ctx)
1271 {
1272         if (refcount_dec_and_test(&ctx->refcount)) {
1273                 if (ctx->parent_ctx)
1274                         put_ctx(ctx->parent_ctx);
1275                 if (ctx->task && ctx->task != TASK_TOMBSTONE)
1276                         put_task_struct(ctx->task);
1277                 call_rcu(&ctx->rcu_head, free_ctx);
1278         }
1279 }
1280
1281 /*
1282  * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
1283  * perf_pmu_migrate_context() we need some magic.
1284  *
1285  * Those places that change perf_event::ctx will hold both
1286  * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
1287  *
1288  * Lock ordering is by mutex address. There are two other sites where
1289  * perf_event_context::mutex nests and those are:
1290  *
1291  *  - perf_event_exit_task_context()    [ child , 0 ]
1292  *      perf_event_exit_event()
1293  *        put_event()                   [ parent, 1 ]
1294  *
1295  *  - perf_event_init_context()         [ parent, 0 ]
1296  *      inherit_task_group()
1297  *        inherit_group()
1298  *          inherit_event()
1299  *            perf_event_alloc()
1300  *              perf_init_event()
1301  *                perf_try_init_event() [ child , 1 ]
1302  *
1303  * While it appears there is an obvious deadlock here -- the parent and child
1304  * nesting levels are inverted between the two -- this is in fact safe because
1305  * life-time rules separate them. That is, an exiting task cannot fork, and a
1306  * spawning task cannot (yet) exit.
1307  *
1308  * But remember that these are parent<->child context relations, and
1309  * migration does not affect children, therefore these two orderings should not
1310  * interact.
1311  *
1312  * The change in perf_event::ctx does not affect children (as claimed above)
1313  * because the sys_perf_event_open() case will install a new event and break
1314  * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
1315  * concerned with cpuctx and that doesn't have children.
1316  *
1317  * The places that change perf_event::ctx will issue:
1318  *
1319  *   perf_remove_from_context();
1320  *   synchronize_rcu();
1321  *   perf_install_in_context();
1322  *
1323  * to effect the change. The remove_from_context() + synchronize_rcu() should
1324  * quiesce the event, after which we can install it in the new location. This
1325  * means that only external vectors (perf_fops, prctl) can perturb the event
1326  * while in transit. Therefore all such accessors should also acquire
1327  * perf_event_context::mutex to serialize against this.
1328  *
1329  * However; because event->ctx can change while we're waiting to acquire
1330  * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
1331  * function.
1332  *
1333  * Lock order:
1334  *    exec_update_lock
1335  *      task_struct::perf_event_mutex
1336  *        perf_event_context::mutex
1337  *          perf_event::child_mutex;
1338  *            perf_event_context::lock
1339  *          perf_event::mmap_mutex
1340  *          mmap_lock
1341  *            perf_addr_filters_head::lock
1342  *
1343  *    cpu_hotplug_lock
1344  *      pmus_lock
1345  *        cpuctx->mutex / perf_event_context::mutex
1346  */
1347 static struct perf_event_context *
1348 perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
1349 {
1350         struct perf_event_context *ctx;
1351
1352 again:
1353         rcu_read_lock();
1354         ctx = READ_ONCE(event->ctx);
1355         if (!refcount_inc_not_zero(&ctx->refcount)) {
1356                 rcu_read_unlock();
1357                 goto again;
1358         }
1359         rcu_read_unlock();
1360
1361         mutex_lock_nested(&ctx->mutex, nesting);
1362         if (event->ctx != ctx) {
1363                 mutex_unlock(&ctx->mutex);
1364                 put_ctx(ctx);
1365                 goto again;
1366         }
1367
1368         return ctx;
1369 }
1370
1371 static inline struct perf_event_context *
1372 perf_event_ctx_lock(struct perf_event *event)
1373 {
1374         return perf_event_ctx_lock_nested(event, 0);
1375 }
1376
1377 static void perf_event_ctx_unlock(struct perf_event *event,
1378                                   struct perf_event_context *ctx)
1379 {
1380         mutex_unlock(&ctx->mutex);
1381         put_ctx(ctx);
1382 }
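/*
 * Editorial illustration (not original source): external entry points that
 * need a stable event->ctx (fops, ioctl, prctl) use the pair above instead
 * of taking ctx->mutex directly:
 *
 *	struct perf_event_context *ctx;
 *
 *	ctx = perf_event_ctx_lock(event);
 *	... event->ctx cannot change or go away here ...
 *	perf_event_ctx_unlock(event, ctx);
 */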
1383
1384 /*
1385  * This must be done under the ctx->lock, such as to serialize against
1386  * context_equiv(); therefore we cannot call put_ctx(), since that might end up
1387  * taking scheduler related locks, and ctx->lock nests inside those.
1388  */
1389 static __must_check struct perf_event_context *
1390 unclone_ctx(struct perf_event_context *ctx)
1391 {
1392         struct perf_event_context *parent_ctx = ctx->parent_ctx;
1393
1394         lockdep_assert_held(&ctx->lock);
1395
1396         if (parent_ctx)
1397                 ctx->parent_ctx = NULL;
1398         ctx->generation++;
1399
1400         return parent_ctx;
1401 }
1402
1403 static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
1404                                 enum pid_type type)
1405 {
1406         u32 nr;
1407         /*
1408          * only top level events have the pid namespace they were created in
1409          */
1410         if (event->parent)
1411                 event = event->parent;
1412
1413         nr = __task_pid_nr_ns(p, type, event->ns);
1414         /* avoid -1 if it is the idle thread or runs in another ns */
1415         if (!nr && !pid_alive(p))
1416                 nr = -1;
1417         return nr;
1418 }
1419
1420 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1421 {
1422         return perf_event_pid_type(event, p, PIDTYPE_TGID);
1423 }
1424
1425 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1426 {
1427         return perf_event_pid_type(event, p, PIDTYPE_PID);
1428 }
1429
1430 /*
1431  * If we inherit events we want to return the parent event id
1432  * to userspace.
1433  */
1434 static u64 primary_event_id(struct perf_event *event)
1435 {
1436         u64 id = event->id;
1437
1438         if (event->parent)
1439                 id = event->parent->id;
1440
1441         return id;
1442 }
1443
1444 /*
1445  * Get the perf_event_context for a task and lock it.
1446  *
1447  * This has to cope with the fact that until it is locked,
1448  * the context could get moved to another task.
1449  */
1450 static struct perf_event_context *
1451 perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
1452 {
1453         struct perf_event_context *ctx;
1454
1455 retry:
1456         /*
1457          * One of the few rules of preemptible RCU is that one cannot do
1458          * rcu_read_unlock() while holding a scheduler (or nested) lock when
1459          * part of the read side critical section was irqs-enabled -- see
1460          * rcu_read_unlock_special().
1461          *
1462          * Since ctx->lock nests under rq->lock we must ensure the entire read
1463          * side critical section has interrupts disabled.
1464          */
1465         local_irq_save(*flags);
1466         rcu_read_lock();
1467         ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
1468         if (ctx) {
1469                 /*
1470                  * If this context is a clone of another, it might
1471                  * get swapped for another underneath us by
1472                  * perf_event_task_sched_out, though the
1473                  * rcu_read_lock() protects us from any context
1474                  * getting freed.  Lock the context and check if it
1475                  * got swapped before we could get the lock, and retry
1476                  * if so.  If we locked the right context, then it
1477                  * can't get swapped on us any more.
1478                  */
1479                 raw_spin_lock(&ctx->lock);
1480                 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
1481                         raw_spin_unlock(&ctx->lock);
1482                         rcu_read_unlock();
1483                         local_irq_restore(*flags);
1484                         goto retry;
1485                 }
1486
1487                 if (ctx->task == TASK_TOMBSTONE ||
1488                     !refcount_inc_not_zero(&ctx->refcount)) {
1489                         raw_spin_unlock(&ctx->lock);
1490                         ctx = NULL;
1491                 } else {
1492                         WARN_ON_ONCE(ctx->task != task);
1493                 }
1494         }
1495         rcu_read_unlock();
1496         if (!ctx)
1497                 local_irq_restore(*flags);
1498         return ctx;
1499 }
1500
1501 /*
1502  * Get the context for a task and increment its pin_count so it
1503  * can't get swapped to another task.  This also increments its
1504  * reference count so that the context can't get freed.
1505  */
1506 static struct perf_event_context *
1507 perf_pin_task_context(struct task_struct *task, int ctxn)
1508 {
1509         struct perf_event_context *ctx;
1510         unsigned long flags;
1511
1512         ctx = perf_lock_task_context(task, ctxn, &flags);
1513         if (ctx) {
1514                 ++ctx->pin_count;
1515                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1516         }
1517         return ctx;
1518 }
1519
1520 static void perf_unpin_context(struct perf_event_context *ctx)
1521 {
1522         unsigned long flags;
1523
1524         raw_spin_lock_irqsave(&ctx->lock, flags);
1525         --ctx->pin_count;
1526         raw_spin_unlock_irqrestore(&ctx->lock, flags);
1527 }
1528
1529 /*
1530  * Update the record of the current time in a context.
1531  */
1532 static void update_context_time(struct perf_event_context *ctx)
1533 {
1534         u64 now = perf_clock();
1535
1536         ctx->time += now - ctx->timestamp;
1537         ctx->timestamp = now;
1538 }
1539
1540 static u64 perf_event_time(struct perf_event *event)
1541 {
1542         struct perf_event_context *ctx = event->ctx;
1543
1544         if (is_cgroup_event(event))
1545                 return perf_cgroup_event_time(event);
1546
1547         return ctx ? ctx->time : 0;
1548 }
1549
1550 static enum event_type_t get_event_type(struct perf_event *event)
1551 {
1552         struct perf_event_context *ctx = event->ctx;
1553         enum event_type_t event_type;
1554
1555         lockdep_assert_held(&ctx->lock);
1556
1557         /*
1558          * It's 'group type', really, because if our group leader is
1559          * pinned, so are we.
1560          */
1561         if (event->group_leader != event)
1562                 event = event->group_leader;
1563
1564         event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
1565         if (!ctx->task)
1566                 event_type |= EVENT_CPU;
1567
1568         return event_type;
1569 }
1570
1571 /*
1572  * Helper function to initialize event group nodes.
1573  */
1574 static void init_event_group(struct perf_event *event)
1575 {
1576         RB_CLEAR_NODE(&event->group_node);
1577         event->group_index = 0;
1578 }
1579
1580 /*
1581  * Extract pinned or flexible groups from the context
1582  * based on event attrs bits.
1583  */
1584 static struct perf_event_groups *
1585 get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
1586 {
1587         if (event->attr.pinned)
1588                 return &ctx->pinned_groups;
1589         else
1590                 return &ctx->flexible_groups;
1591 }
1592
1593 /*
1594  * Helper function to initialize perf_event_groups trees.
1595  */
1596 static void perf_event_groups_init(struct perf_event_groups *groups)
1597 {
1598         groups->tree = RB_ROOT;
1599         groups->index = 0;
1600 }
1601
1602 static inline struct cgroup *event_cgroup(const struct perf_event *event)
1603 {
1604         struct cgroup *cgroup = NULL;
1605
1606 #ifdef CONFIG_CGROUP_PERF
1607         if (event->cgrp)
1608                 cgroup = event->cgrp->css.cgroup;
1609 #endif
1610
1611         return cgroup;
1612 }
1613
1614 /*
1615  * Compare function for event groups;
1616  *
1617  * Implements a complex key that sorts first by CPU, then by cgroup id (when
1618  * enabled), and then by a virtual index that orders rotation within the same CPU.
1619  */
1620 static __always_inline int
1621 perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup,
1622                       const u64 left_group_index, const struct perf_event *right)
1623 {
1624         if (left_cpu < right->cpu)
1625                 return -1;
1626         if (left_cpu > right->cpu)
1627                 return 1;
1628
1629 #ifdef CONFIG_CGROUP_PERF
1630         {
1631                 const struct cgroup *right_cgroup = event_cgroup(right);
1632
1633                 if (left_cgroup != right_cgroup) {
1634                         if (!left_cgroup) {
1635                                 /*
1636                                  * Left has no cgroup but right does; events
1637                                  * without a cgroup sort first.
1638                                  */
1639                                 return -1;
1640                         }
1641                         if (!right_cgroup) {
1642                                 /*
1643                                  * Right has no cgroup but left does;
1644                                  * events without a cgroup come first.
1645                                  */
1646                                 return 1;
1647                         }
1648                         /* Two dissimilar cgroups, order by id. */
1649                         if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup))
1650                                 return -1;
1651
1652                         return 1;
1653                 }
1654         }
1655 #endif
1656
1657         if (left_group_index < right->group_index)
1658                 return -1;
1659         if (left_group_index > right->group_index)
1660                 return 1;
1661
1662         return 0;
1663 }
1664
1665 #define __node_2_pe(node) \
1666         rb_entry((node), struct perf_event, group_node)
1667
1668 static inline bool __group_less(struct rb_node *a, const struct rb_node *b)
1669 {
1670         struct perf_event *e = __node_2_pe(a);
1671         return perf_event_groups_cmp(e->cpu, event_cgroup(e), e->group_index,
1672                                      __node_2_pe(b)) < 0;
1673 }
1674
1675 struct __group_key {
1676         int cpu;
1677         struct cgroup *cgroup;
1678 };
1679
1680 static inline int __group_cmp(const void *key, const struct rb_node *node)
1681 {
1682         const struct __group_key *a = key;
1683         const struct perf_event *b = __node_2_pe(node);
1684
1685         /* partial/subtree match: @cpu, @cgroup; ignore: @group_index */
1686         return perf_event_groups_cmp(a->cpu, a->cgroup, b->group_index, b);
1687 }
1688
1689 /*
1690  * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
1691  * key (see __group_less()). This places it last inside the CPU
1692  * subtree.
1693  */
1694 static void
1695 perf_event_groups_insert(struct perf_event_groups *groups,
1696                          struct perf_event *event)
1697 {
1698         event->group_index = ++groups->index;
1699
1700         rb_add(&event->group_node, &groups->tree, __group_less);
1701 }
1702
1703 /*
1704  * Helper function to insert event into the pinned or flexible groups.
1705  */
1706 static void
1707 add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
1708 {
1709         struct perf_event_groups *groups;
1710
1711         groups = get_event_groups(event, ctx);
1712         perf_event_groups_insert(groups, event);
1713 }
1714
1715 /*
1716  * Delete a group from a tree.
1717  */
1718 static void
1719 perf_event_groups_delete(struct perf_event_groups *groups,
1720                          struct perf_event *event)
1721 {
1722         WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
1723                      RB_EMPTY_ROOT(&groups->tree));
1724
1725         rb_erase(&event->group_node, &groups->tree);
1726         init_event_group(event);
1727 }
1728
1729 /*
1730  * Helper function to delete event from its groups.
1731  */
1732 static void
1733 del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
1734 {
1735         struct perf_event_groups *groups;
1736
1737         groups = get_event_groups(event, ctx);
1738         perf_event_groups_delete(groups, event);
1739 }
1740
1741 /*
1742  * Get the leftmost event in the cpu/cgroup subtree.
1743  */
1744 static struct perf_event *
1745 perf_event_groups_first(struct perf_event_groups *groups, int cpu,
1746                         struct cgroup *cgrp)
1747 {
1748         struct __group_key key = {
1749                 .cpu = cpu,
1750                 .cgroup = cgrp,
1751         };
1752         struct rb_node *node;
1753
1754         node = rb_find_first(&key, &groups->tree, __group_cmp);
1755         if (node)
1756                 return __node_2_pe(node);
1757
1758         return NULL;
1759 }
1760
1761 /*
1762  * Like rb_next(), but constrained to the same {cpu, cgroup} subtree.
1763  */
1764 static struct perf_event *
1765 perf_event_groups_next(struct perf_event *event)
1766 {
1767         struct __group_key key = {
1768                 .cpu = event->cpu,
1769                 .cgroup = event_cgroup(event),
1770         };
1771         struct rb_node *next;
1772
1773         next = rb_next_match(&key, &event->group_node, __group_cmp);
1774         if (next)
1775                 return __node_2_pe(next);
1776
1777         return NULL;
1778 }
1779
1780 /*
1781  * Iterate through the whole groups tree.
1782  */
1783 #define perf_event_groups_for_each(event, groups)                       \
1784         for (event = rb_entry_safe(rb_first(&((groups)->tree)),         \
1785                                 typeof(*event), group_node); event;     \
1786                 event = rb_entry_safe(rb_next(&event->group_node),      \
1787                                 typeof(*event), group_node))
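/*
 * Illustrative sketch only (not used by this file): walking a single
 * {cpu, cgroup} subtree composes perf_event_groups_first() with
 * perf_event_groups_next(), much like perf_event_groups_for_each() does
 * for the whole tree. The callback type below is hypothetical.
 *
 *	static void visit_cpu_subtree(struct perf_event_groups *groups,
 *				      int cpu, struct cgroup *cgrp,
 *				      void (*fn)(struct perf_event *))
 *	{
 *		struct perf_event *event;
 *
 *		for (event = perf_event_groups_first(groups, cpu, cgrp);
 *		     event;
 *		     event = perf_event_groups_next(event))
 *			fn(event);
 *	}
 */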
1788
1789 /*
1790  * Add an event to the lists for its context.
1791  * Must be called with ctx->mutex and ctx->lock held.
1792  */
1793 static void
1794 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1795 {
1796         lockdep_assert_held(&ctx->lock);
1797
1798         WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1799         event->attach_state |= PERF_ATTACH_CONTEXT;
1800
1801         event->tstamp = perf_event_time(event);
1802
1803         /*
1804          * If we're a standalone event or group leader, we go on the context
1805          * list; group events are kept attached to the group so that
1806          * perf_group_detach() can, at all times, locate all siblings.
1807          */
1808         if (event->group_leader == event) {
1809                 event->group_caps = event->event_caps;
1810                 add_event_to_groups(event, ctx);
1811         }
1812
1813         list_add_rcu(&event->event_entry, &ctx->event_list);
1814         ctx->nr_events++;
1815         if (event->attr.inherit_stat)
1816                 ctx->nr_stat++;
1817
1818         if (event->state > PERF_EVENT_STATE_OFF)
1819                 perf_cgroup_event_enable(event, ctx);
1820
1821         ctx->generation++;
1822 }
1823
1824 /*
1825  * Initialize event state based on the perf_event_attr::disabled.
1826  */
1827 static inline void perf_event__state_init(struct perf_event *event)
1828 {
1829         event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1830                                               PERF_EVENT_STATE_INACTIVE;
1831 }
1832
1833 static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
1834 {
1835         int entry = sizeof(u64); /* value */
1836         int size = 0;
1837         int nr = 1;
1838
1839         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1840                 size += sizeof(u64);
1841
1842         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1843                 size += sizeof(u64);
1844
1845         if (event->attr.read_format & PERF_FORMAT_ID)
1846                 entry += sizeof(u64);
1847
1848         if (event->attr.read_format & PERF_FORMAT_GROUP) {
1849                 nr += nr_siblings;
1850                 size += sizeof(u64);
1851         }
1852
1853         size += entry * nr;
1854         event->read_size = size;
1855 }
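/*
 * Worked example (illustration only): for a group leader with two siblings
 * and read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID, the code above ends
 * up with entry = 16 (value + id), nr = 3 (leader + 2 siblings) and an extra
 * 8 bytes for the leading nr field, so event->read_size = 8 + 3 * 16 = 56,
 * matching the layout a read() of the group returns:
 *
 *	struct read_format {
 *		__u64 nr;		// 3
 *		struct {
 *			__u64 value;
 *			__u64 id;	// because PERF_FORMAT_ID
 *		} cnt[3];
 *	};
 */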
1856
1857 static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
1858 {
1859         struct perf_sample_data *data;
1860         u16 size = 0;
1861
1862         if (sample_type & PERF_SAMPLE_IP)
1863                 size += sizeof(data->ip);
1864
1865         if (sample_type & PERF_SAMPLE_ADDR)
1866                 size += sizeof(data->addr);
1867
1868         if (sample_type & PERF_SAMPLE_PERIOD)
1869                 size += sizeof(data->period);
1870
1871         if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
1872                 size += sizeof(data->weight.full);
1873
1874         if (sample_type & PERF_SAMPLE_READ)
1875                 size += event->read_size;
1876
1877         if (sample_type & PERF_SAMPLE_DATA_SRC)
1878                 size += sizeof(data->data_src.val);
1879
1880         if (sample_type & PERF_SAMPLE_TRANSACTION)
1881                 size += sizeof(data->txn);
1882
1883         if (sample_type & PERF_SAMPLE_PHYS_ADDR)
1884                 size += sizeof(data->phys_addr);
1885
1886         if (sample_type & PERF_SAMPLE_CGROUP)
1887                 size += sizeof(data->cgroup);
1888
1889         if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
1890                 size += sizeof(data->data_page_size);
1891
1892         if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
1893                 size += sizeof(data->code_page_size);
1894
1895         event->header_size = size;
1896 }
1897
1898 /*
1899  * Called at perf_event creation and when events are attached/detached from a
1900  * group.
1901  */
1902 static void perf_event__header_size(struct perf_event *event)
1903 {
1904         __perf_event_read_size(event,
1905                                event->group_leader->nr_siblings);
1906         __perf_event_header_size(event, event->attr.sample_type);
1907 }
1908
1909 static void perf_event__id_header_size(struct perf_event *event)
1910 {
1911         struct perf_sample_data *data;
1912         u64 sample_type = event->attr.sample_type;
1913         u16 size = 0;
1914
1915         if (sample_type & PERF_SAMPLE_TID)
1916                 size += sizeof(data->tid_entry);
1917
1918         if (sample_type & PERF_SAMPLE_TIME)
1919                 size += sizeof(data->time);
1920
1921         if (sample_type & PERF_SAMPLE_IDENTIFIER)
1922                 size += sizeof(data->id);
1923
1924         if (sample_type & PERF_SAMPLE_ID)
1925                 size += sizeof(data->id);
1926
1927         if (sample_type & PERF_SAMPLE_STREAM_ID)
1928                 size += sizeof(data->stream_id);
1929
1930         if (sample_type & PERF_SAMPLE_CPU)
1931                 size += sizeof(data->cpu_entry);
1932
1933         event->id_header_size = size;
1934 }
1935
1936 static bool perf_event_validate_size(struct perf_event *event)
1937 {
1938         /*
1939          * The values computed here will be overwritten when we actually
1940          * attach the event.
1941          */
1942         __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
1943         __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
1944         perf_event__id_header_size(event);
1945
1946         /*
1947          * Sum the lot; should not exceed the 64k limit we have on records.
1948          * Conservative limit to allow for callchains and other variable fields.
1949          */
1950         if (event->read_size + event->header_size +
1951             event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
1952                 return false;
1953
1954         return true;
1955 }
1956
1957 static void perf_group_attach(struct perf_event *event)
1958 {
1959         struct perf_event *group_leader = event->group_leader, *pos;
1960
1961         lockdep_assert_held(&event->ctx->lock);
1962
1963         /*
1964          * We can have double attach due to group movement in perf_event_open.
1965          */
1966         if (event->attach_state & PERF_ATTACH_GROUP)
1967                 return;
1968
1969         event->attach_state |= PERF_ATTACH_GROUP;
1970
1971         if (group_leader == event)
1972                 return;
1973
1974         WARN_ON_ONCE(group_leader->ctx != event->ctx);
1975
1976         group_leader->group_caps &= event->event_caps;
1977
1978         list_add_tail(&event->sibling_list, &group_leader->sibling_list);
1979         group_leader->nr_siblings++;
1980
1981         perf_event__header_size(group_leader);
1982
1983         for_each_sibling_event(pos, group_leader)
1984                 perf_event__header_size(pos);
1985 }
1986
1987 /*
1988  * Remove an event from the lists for its context.
1989  * Must be called with ctx->mutex and ctx->lock held.
1990  */
1991 static void
1992 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1993 {
1994         WARN_ON_ONCE(event->ctx != ctx);
1995         lockdep_assert_held(&ctx->lock);
1996
1997         /*
1998          * We can have double detach due to exit/hot-unplug + close.
1999          */
2000         if (!(event->attach_state & PERF_ATTACH_CONTEXT))
2001                 return;
2002
2003         event->attach_state &= ~PERF_ATTACH_CONTEXT;
2004
2005         ctx->nr_events--;
2006         if (event->attr.inherit_stat)
2007                 ctx->nr_stat--;
2008
2009         list_del_rcu(&event->event_entry);
2010
2011         if (event->group_leader == event)
2012                 del_event_from_groups(event, ctx);
2013
2014         /*
2015          * If event was in error state, then keep it
2016          * that way, otherwise bogus counts will be
2017          * returned on read(). The only way to get out
2018          * of error state is by explicit re-enabling
2019          * of the event
2020          */
2021         if (event->state > PERF_EVENT_STATE_OFF) {
2022                 perf_cgroup_event_disable(event, ctx);
2023                 perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2024         }
2025
2026         ctx->generation++;
2027 }
2028
2029 static int
2030 perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
2031 {
2032         if (!has_aux(aux_event))
2033                 return 0;
2034
2035         if (!event->pmu->aux_output_match)
2036                 return 0;
2037
2038         return event->pmu->aux_output_match(aux_event);
2039 }
2040
2041 static void put_event(struct perf_event *event);
2042 static void event_sched_out(struct perf_event *event,
2043                             struct perf_cpu_context *cpuctx,
2044                             struct perf_event_context *ctx);
2045
2046 static void perf_put_aux_event(struct perf_event *event)
2047 {
2048         struct perf_event_context *ctx = event->ctx;
2049         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2050         struct perf_event *iter;
2051
2052         /*
2053          * If the event uses an aux_event, tear down the link
2054          */
2055         if (event->aux_event) {
2056                 iter = event->aux_event;
2057                 event->aux_event = NULL;
2058                 put_event(iter);
2059                 return;
2060         }
2061
2062         /*
2063          * If the event is an aux_event, tear down all links to
2064          * it from other events.
2065          */
2066         for_each_sibling_event(iter, event->group_leader) {
2067                 if (iter->aux_event != event)
2068                         continue;
2069
2070                 iter->aux_event = NULL;
2071                 put_event(event);
2072
2073                 /*
2074                  * If it's ACTIVE, schedule it out and put it into ERROR
2075                  * state so that we don't try to schedule it again. Note
2076                  * that perf_event_enable() will clear the ERROR status.
2077                  */
2078                 event_sched_out(iter, cpuctx, ctx);
2079                 perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
2080         }
2081 }
2082
2083 static bool perf_need_aux_event(struct perf_event *event)
2084 {
2085         return !!event->attr.aux_output || !!event->attr.aux_sample_size;
2086 }
2087
2088 static int perf_get_aux_event(struct perf_event *event,
2089                               struct perf_event *group_leader)
2090 {
2091         /*
2092          * Our group leader must be an aux event if we want to be
2093          * an aux_output. This way, the aux event will precede its
2094          * aux_output events in the group, and therefore will always
2095          * schedule first.
2096          */
2097         if (!group_leader)
2098                 return 0;
2099
2100         /*
2101          * aux_output and aux_sample_size are mutually exclusive.
2102          */
2103         if (event->attr.aux_output && event->attr.aux_sample_size)
2104                 return 0;
2105
2106         if (event->attr.aux_output &&
2107             !perf_aux_output_match(event, group_leader))
2108                 return 0;
2109
2110         if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
2111                 return 0;
2112
2113         if (!atomic_long_inc_not_zero(&group_leader->refcount))
2114                 return 0;
2115
2116         /*
2117          * Link aux_outputs to their aux event; this is undone in
2118          * perf_group_detach() by perf_put_aux_event(). When the
2119          * group is torn down, the aux_output events lose their
2120          * link to the aux_event and can't schedule any more.
2121          */
2122         event->aux_event = group_leader;
2123
2124         return 1;
2125 }
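/*
 * Userspace sketch (illustration only; assumes an AUX-capable PMU such as
 * intel_pt, whose type would be read from sysfs by the hypothetical
 * read_sysfs_pmu_type() helper; error handling elided). Opening the second
 * event with attr.aux_output = 1 and group_fd = aux_fd is what ends up in
 * perf_get_aux_event() above and links the two:
 *
 *	struct perf_event_attr aux = { 0 }, out = { 0 };
 *	int aux_fd, out_fd;
 *
 *	aux.size = sizeof(aux);
 *	aux.type = read_sysfs_pmu_type("intel_pt");
 *	aux_fd = syscall(__NR_perf_event_open, &aux, 0, -1, -1, 0);
 *
 *	out.size = sizeof(out);
 *	out.type = PERF_TYPE_HARDWARE;
 *	out.config = PERF_COUNT_HW_INSTRUCTIONS;
 *	out.sample_period = 100000;
 *	out.aux_output = 1;	// emit this event's samples into the leader's AUX stream
 *	out_fd = syscall(__NR_perf_event_open, &out, 0, -1, aux_fd, 0);
 */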
2126
2127 static inline struct list_head *get_event_list(struct perf_event *event)
2128 {
2129         struct perf_event_context *ctx = event->ctx;
2130         return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
2131 }
2132
2133 /*
2134  * Events that have PERF_EV_CAP_SIBLING require being part of a group and
2135  * cannot exist on their own, schedule them out and move them into the ERROR
2136  * state. Also see _perf_event_enable(); it will not be able to recover
2137  * this ERROR state.
2138  */
2139 static inline void perf_remove_sibling_event(struct perf_event *event)
2140 {
2141         struct perf_event_context *ctx = event->ctx;
2142         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2143
2144         event_sched_out(event, cpuctx, ctx);
2145         perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
2146 }
2147
2148 static void perf_group_detach(struct perf_event *event)
2149 {
2150         struct perf_event *leader = event->group_leader;
2151         struct perf_event *sibling, *tmp;
2152         struct perf_event_context *ctx = event->ctx;
2153
2154         lockdep_assert_held(&ctx->lock);
2155
2156         /*
2157          * We can have double detach due to exit/hot-unplug + close.
2158          */
2159         if (!(event->attach_state & PERF_ATTACH_GROUP))
2160                 return;
2161
2162         event->attach_state &= ~PERF_ATTACH_GROUP;
2163
2164         perf_put_aux_event(event);
2165
2166         /*
2167          * If this is a sibling, remove it from its group.
2168          */
2169         if (leader != event) {
2170                 list_del_init(&event->sibling_list);
2171                 event->group_leader->nr_siblings--;
2172                 goto out;
2173         }
2174
2175         /*
2176          * If this was a group event with sibling events then
2177          * upgrade the siblings to singleton events by adding them
2178          * to whatever list we are on.
2179          */
2180         list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
2181
2182                 if (sibling->event_caps & PERF_EV_CAP_SIBLING)
2183                         perf_remove_sibling_event(sibling);
2184
2185                 sibling->group_leader = sibling;
2186                 list_del_init(&sibling->sibling_list);
2187
2188                 /* Inherit group flags from the previous leader */
2189                 sibling->group_caps = event->group_caps;
2190
2191                 if (!RB_EMPTY_NODE(&event->group_node)) {
2192                         add_event_to_groups(sibling, event->ctx);
2193
2194                         if (sibling->state == PERF_EVENT_STATE_ACTIVE)
2195                                 list_add_tail(&sibling->active_list, get_event_list(sibling));
2196                 }
2197
2198                 WARN_ON_ONCE(sibling->ctx != event->ctx);
2199         }
2200
2201 out:
2202         for_each_sibling_event(tmp, leader)
2203                 perf_event__header_size(tmp);
2204
2205         perf_event__header_size(leader);
2206 }
2207
2208 static void sync_child_event(struct perf_event *child_event);
2209
2210 static void perf_child_detach(struct perf_event *event)
2211 {
2212         struct perf_event *parent_event = event->parent;
2213
2214         if (!(event->attach_state & PERF_ATTACH_CHILD))
2215                 return;
2216
2217         event->attach_state &= ~PERF_ATTACH_CHILD;
2218
2219         if (WARN_ON_ONCE(!parent_event))
2220                 return;
2221
2222         lockdep_assert_held(&parent_event->child_mutex);
2223
2224         sync_child_event(event);
2225         list_del_init(&event->child_list);
2226 }
2227
2228 static bool is_orphaned_event(struct perf_event *event)
2229 {
2230         return event->state == PERF_EVENT_STATE_DEAD;
2231 }
2232
2233 static inline int __pmu_filter_match(struct perf_event *event)
2234 {
2235         struct pmu *pmu = event->pmu;
2236         return pmu->filter_match ? pmu->filter_match(event) : 1;
2237 }
2238
2239 /*
2240  * Check whether we should attempt to schedule an event group based on
2241  * PMU-specific filtering. An event group can consist of HW and SW events,
2242  * potentially with a SW leader, so we must check all the filters to
2243  * determine whether a group is schedulable.
2244  */
2245 static inline int pmu_filter_match(struct perf_event *event)
2246 {
2247         struct perf_event *sibling;
2248
2249         if (!__pmu_filter_match(event))
2250                 return 0;
2251
2252         for_each_sibling_event(sibling, event) {
2253                 if (!__pmu_filter_match(sibling))
2254                         return 0;
2255         }
2256
2257         return 1;
2258 }
2259
2260 static inline int
2261 event_filter_match(struct perf_event *event)
2262 {
2263         return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
2264                perf_cgroup_match(event) && pmu_filter_match(event);
2265 }
2266
2267 static void
2268 event_sched_out(struct perf_event *event,
2269                   struct perf_cpu_context *cpuctx,
2270                   struct perf_event_context *ctx)
2271 {
2272         enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
2273
2274         WARN_ON_ONCE(event->ctx != ctx);
2275         lockdep_assert_held(&ctx->lock);
2276
2277         if (event->state != PERF_EVENT_STATE_ACTIVE)
2278                 return;
2279
2280         /*
2281          * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but
2282          * we can schedule events _OUT_ individually through things like
2283          * __perf_remove_from_context().
2284          */
2285         list_del_init(&event->active_list);
2286
2287         perf_pmu_disable(event->pmu);
2288
2289         event->pmu->del(event, 0);
2290         event->oncpu = -1;
2291
2292         if (READ_ONCE(event->pending_disable) >= 0) {
2293                 WRITE_ONCE(event->pending_disable, -1);
2294                 perf_cgroup_event_disable(event, ctx);
2295                 state = PERF_EVENT_STATE_OFF;
2296         }
2297         perf_event_set_state(event, state);
2298
2299         if (!is_software_event(event))
2300                 cpuctx->active_oncpu--;
2301         if (!--ctx->nr_active)
2302                 perf_event_ctx_deactivate(ctx);
2303         if (event->attr.freq && event->attr.sample_freq)
2304                 ctx->nr_freq--;
2305         if (event->attr.exclusive || !cpuctx->active_oncpu)
2306                 cpuctx->exclusive = 0;
2307
2308         perf_pmu_enable(event->pmu);
2309 }
2310
2311 static void
2312 group_sched_out(struct perf_event *group_event,
2313                 struct perf_cpu_context *cpuctx,
2314                 struct perf_event_context *ctx)
2315 {
2316         struct perf_event *event;
2317
2318         if (group_event->state != PERF_EVENT_STATE_ACTIVE)
2319                 return;
2320
2321         perf_pmu_disable(ctx->pmu);
2322
2323         event_sched_out(group_event, cpuctx, ctx);
2324
2325         /*
2326          * Schedule out siblings (if any):
2327          */
2328         for_each_sibling_event(event, group_event)
2329                 event_sched_out(event, cpuctx, ctx);
2330
2331         perf_pmu_enable(ctx->pmu);
2332 }
2333
2334 #define DETACH_GROUP    0x01UL
2335 #define DETACH_CHILD    0x02UL
2336
2337 /*
2338  * Cross CPU call to remove a performance event
2339  *
2340  * We disable the event on the hardware level first. After that we
2341  * remove it from the context list.
2342  */
2343 static void
2344 __perf_remove_from_context(struct perf_event *event,
2345                            struct perf_cpu_context *cpuctx,
2346                            struct perf_event_context *ctx,
2347                            void *info)
2348 {
2349         unsigned long flags = (unsigned long)info;
2350
2351         if (ctx->is_active & EVENT_TIME) {
2352                 update_context_time(ctx);
2353                 update_cgrp_time_from_cpuctx(cpuctx);
2354         }
2355
2356         event_sched_out(event, cpuctx, ctx);
2357         if (flags & DETACH_GROUP)
2358                 perf_group_detach(event);
2359         if (flags & DETACH_CHILD)
2360                 perf_child_detach(event);
2361         list_del_event(event, ctx);
2362
2363         if (!ctx->nr_events && ctx->is_active) {
2364                 ctx->is_active = 0;
2365                 ctx->rotate_necessary = 0;
2366                 if (ctx->task) {
2367                         WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2368                         cpuctx->task_ctx = NULL;
2369                 }
2370         }
2371 }
2372
2373 /*
2374  * Remove the event from a task's (or a CPU's) list of events.
2375  *
2376  * If event->ctx is a cloned context, callers must make sure that
2377  * every task struct that event->ctx->task could possibly point to
2378  * remains valid.  This is OK when called from perf_release since
2379  * that only calls us on the top-level context, which can't be a clone.
2380  * When called from perf_event_exit_task, it's OK because the
2381  * context has been detached from its task.
2382  */
2383 static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
2384 {
2385         struct perf_event_context *ctx = event->ctx;
2386
2387         lockdep_assert_held(&ctx->mutex);
2388
2389         /*
2390          * Because of perf_event_exit_task(), perf_remove_from_context() ought
2391          * to work in the face of TASK_TOMBSTONE, unlike every other
2392          * event_function_call() user.
2393          */
2394         raw_spin_lock_irq(&ctx->lock);
2395         if (!ctx->is_active) {
2396                 __perf_remove_from_context(event, __get_cpu_context(ctx),
2397                                            ctx, (void *)flags);
2398                 raw_spin_unlock_irq(&ctx->lock);
2399                 return;
2400         }
2401         raw_spin_unlock_irq(&ctx->lock);
2402
2403         event_function_call(event, __perf_remove_from_context, (void *)flags);
2404 }
2405
2406 /*
2407  * Cross CPU call to disable a performance event
2408  */
2409 static void __perf_event_disable(struct perf_event *event,
2410                                  struct perf_cpu_context *cpuctx,
2411                                  struct perf_event_context *ctx,
2412                                  void *info)
2413 {
2414         if (event->state < PERF_EVENT_STATE_INACTIVE)
2415                 return;
2416
2417         if (ctx->is_active & EVENT_TIME) {
2418                 update_context_time(ctx);
2419                 update_cgrp_time_from_event(event);
2420         }
2421
2422         if (event == event->group_leader)
2423                 group_sched_out(event, cpuctx, ctx);
2424         else
2425                 event_sched_out(event, cpuctx, ctx);
2426
2427         perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2428         perf_cgroup_event_disable(event, ctx);
2429 }
2430
2431 /*
2432  * Disable an event.
2433  *
2434  * If event->ctx is a cloned context, callers must make sure that
2435  * every task struct that event->ctx->task could possibly point to
2436  * remains valid.  This condition is satisfied when called through
2437  * perf_event_for_each_child or perf_event_for_each because they
2438  * hold the top-level event's child_mutex, so any descendant that
2439  * goes to exit will block in perf_event_exit_event().
2440  *
2441  * When called from perf_pending_event it's OK because event->ctx
2442  * is the current context on this CPU and preemption is disabled,
2443  * hence we can't get into perf_event_task_sched_out for this context.
2444  */
2445 static void _perf_event_disable(struct perf_event *event)
2446 {
2447         struct perf_event_context *ctx = event->ctx;
2448
2449         raw_spin_lock_irq(&ctx->lock);
2450         if (event->state <= PERF_EVENT_STATE_OFF) {
2451                 raw_spin_unlock_irq(&ctx->lock);
2452                 return;
2453         }
2454         raw_spin_unlock_irq(&ctx->lock);
2455
2456         event_function_call(event, __perf_event_disable, NULL);
2457 }
2458
2459 void perf_event_disable_local(struct perf_event *event)
2460 {
2461         event_function_local(event, __perf_event_disable, NULL);
2462 }
2463
2464 /*
2465  * Strictly speaking kernel users cannot create groups and therefore this
2466  * interface does not need the perf_event_ctx_lock() magic.
2467  */
2468 void perf_event_disable(struct perf_event *event)
2469 {
2470         struct perf_event_context *ctx;
2471
2472         ctx = perf_event_ctx_lock(event);
2473         _perf_event_disable(event);
2474         perf_event_ctx_unlock(event, ctx);
2475 }
2476 EXPORT_SYMBOL_GPL(perf_event_disable);
2477
2478 void perf_event_disable_inatomic(struct perf_event *event)
2479 {
2480         WRITE_ONCE(event->pending_disable, smp_processor_id());
2481         /* can fail, see perf_pending_event_disable() */
2482         irq_work_queue(&event->pending);
2483 }
2484
2485 static void perf_set_shadow_time(struct perf_event *event,
2486                                  struct perf_event_context *ctx)
2487 {
2488         /*
2489          * use the correct time source for the time snapshot
2490          *
2491          * We could get by without this by leveraging the
2492          * fact that to get to this function, the caller
2493          * has most likely already called update_context_time()
2494          * and update_cgrp_time_xx() and thus both timestamps
2495          * are identical (or very close). Given that tstamp is
2496          * already adjusted for cgroup, we could say that:
2497          *    tstamp - ctx->timestamp
2498          * is equivalent to
2499          *    tstamp - cgrp->timestamp.
2500          *
2501          * Then, in perf_output_read(), the calculation would
2502          * work with no changes because:
2503          * - event is guaranteed scheduled in
2504          * - no scheduled out in between
2505          * - thus the timestamp would be the same
2506          *
2507          * But this is a bit hairy.
2508          *
2509          * So instead, we have an explicit cgroup call to remain
2510          * within the time source all along. We believe it
2511          * is cleaner and simpler to understand.
2512          */
2513         if (is_cgroup_event(event))
2514                 perf_cgroup_set_shadow_time(event, event->tstamp);
2515         else
2516                 event->shadow_ctx_time = event->tstamp - ctx->timestamp;
2517 }
2518
2519 #define MAX_INTERRUPTS (~0ULL)
2520
2521 static void perf_log_throttle(struct perf_event *event, int enable);
2522 static void perf_log_itrace_start(struct perf_event *event);
2523
2524 static int
2525 event_sched_in(struct perf_event *event,
2526                  struct perf_cpu_context *cpuctx,
2527                  struct perf_event_context *ctx)
2528 {
2529         int ret = 0;
2530
2531         WARN_ON_ONCE(event->ctx != ctx);
2532
2533         lockdep_assert_held(&ctx->lock);
2534
2535         if (event->state <= PERF_EVENT_STATE_OFF)
2536                 return 0;
2537
2538         WRITE_ONCE(event->oncpu, smp_processor_id());
2539         /*
2540          * Order event::oncpu write to happen before the ACTIVE state is
2541          * visible. This allows perf_event_{stop,read}() to observe the correct
2542          * ->oncpu if it sees ACTIVE.
2543          */
2544         smp_wmb();
2545         perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);
2546
2547         /*
2548          * Unthrottle events; since we scheduled we might have missed several
2549          * ticks already, and for a heavily scheduling task there is little
2550          * guarantee it'll get a tick in a timely manner.
2551          */
2552         if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
2553                 perf_log_throttle(event, 1);
2554                 event->hw.interrupts = 0;
2555         }
2556
2557         perf_pmu_disable(event->pmu);
2558
2559         perf_set_shadow_time(event, ctx);
2560
2561         perf_log_itrace_start(event);
2562
2563         if (event->pmu->add(event, PERF_EF_START)) {
2564                 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2565                 event->oncpu = -1;
2566                 ret = -EAGAIN;
2567                 goto out;
2568         }
2569
2570         if (!is_software_event(event))
2571                 cpuctx->active_oncpu++;
2572         if (!ctx->nr_active++)
2573                 perf_event_ctx_activate(ctx);
2574         if (event->attr.freq && event->attr.sample_freq)
2575                 ctx->nr_freq++;
2576
2577         if (event->attr.exclusive)
2578                 cpuctx->exclusive = 1;
2579
2580 out:
2581         perf_pmu_enable(event->pmu);
2582
2583         return ret;
2584 }
2585
2586 static int
2587 group_sched_in(struct perf_event *group_event,
2588                struct perf_cpu_context *cpuctx,
2589                struct perf_event_context *ctx)
2590 {
2591         struct perf_event *event, *partial_group = NULL;
2592         struct pmu *pmu = ctx->pmu;
2593
2594         if (group_event->state == PERF_EVENT_STATE_OFF)
2595                 return 0;
2596
2597         pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
2598
2599         if (event_sched_in(group_event, cpuctx, ctx))
2600                 goto error;
2601
2602         /*
2603          * Schedule in siblings as one group (if any):
2604          */
2605         for_each_sibling_event(event, group_event) {
2606                 if (event_sched_in(event, cpuctx, ctx)) {
2607                         partial_group = event;
2608                         goto group_error;
2609                 }
2610         }
2611
2612         if (!pmu->commit_txn(pmu))
2613                 return 0;
2614
2615 group_error:
2616         /*
2617          * Groups can be scheduled in as one unit only, so undo any
2618          * partial group before returning:
2619          * The events up to the failed event are scheduled out normally.
2620          */
2621         for_each_sibling_event(event, group_event) {
2622                 if (event == partial_group)
2623                         break;
2624
2625                 event_sched_out(event, cpuctx, ctx);
2626         }
2627         event_sched_out(group_event, cpuctx, ctx);
2628
2629 error:
2630         pmu->cancel_txn(pmu);
2631         return -EAGAIN;
2632 }
2633
2634 /*
2635  * Work out whether we can put this event group on the CPU now.
2636  */
2637 static int group_can_go_on(struct perf_event *event,
2638                            struct perf_cpu_context *cpuctx,
2639                            int can_add_hw)
2640 {
2641         /*
2642          * Groups consisting entirely of software events can always go on.
2643          */
2644         if (event->group_caps & PERF_EV_CAP_SOFTWARE)
2645                 return 1;
2646         /*
2647          * If an exclusive group is already on, no other hardware
2648          * events can go on.
2649          */
2650         if (cpuctx->exclusive)
2651                 return 0;
2652         /*
2653          * If this group is exclusive and there are already
2654          * events on the CPU, it can't go on.
2655          */
2656         if (event->attr.exclusive && !list_empty(get_event_list(event)))
2657                 return 0;
2658         /*
2659          * Otherwise, try to add it if all previous groups were able
2660          * to go on.
2661          */
2662         return can_add_hw;
2663 }
2664
2665 static void add_event_to_ctx(struct perf_event *event,
2666                                struct perf_event_context *ctx)
2667 {
2668         list_add_event(event, ctx);
2669         perf_group_attach(event);
2670 }
2671
2672 static void ctx_sched_out(struct perf_event_context *ctx,
2673                           struct perf_cpu_context *cpuctx,
2674                           enum event_type_t event_type);
2675 static void
2676 ctx_sched_in(struct perf_event_context *ctx,
2677              struct perf_cpu_context *cpuctx,
2678              enum event_type_t event_type,
2679              struct task_struct *task);
2680
2681 static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
2682                                struct perf_event_context *ctx,
2683                                enum event_type_t event_type)
2684 {
2685         if (!cpuctx->task_ctx)
2686                 return;
2687
2688         if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2689                 return;
2690
2691         ctx_sched_out(ctx, cpuctx, event_type);
2692 }
2693
2694 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2695                                 struct perf_event_context *ctx,
2696                                 struct task_struct *task)
2697 {
2698         cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
2699         if (ctx)
2700                 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2701         cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2702         if (ctx)
2703                 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2704 }
2705
2706 /*
2707  * We want to maintain the following priority of scheduling:
2708  *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
2709  *  - task pinned (EVENT_PINNED)
2710  *  - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
2711  *  - task flexible (EVENT_FLEXIBLE).
2712  *
2713  * In order to avoid unscheduling and scheduling back in everything every
2714  * time an event is added, only do it for the groups of equal priority and
2715  * below.
2716  *
2717  * This can be called after a batch operation on task events, in which case
2718  * event_type is a bit mask of the types of events involved. For CPU events,
2719  * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
2720  */
2721 static void ctx_resched(struct perf_cpu_context *cpuctx,
2722                         struct perf_event_context *task_ctx,
2723                         enum event_type_t event_type)
2724 {
2725         enum event_type_t ctx_event_type;
2726         bool cpu_event = !!(event_type & EVENT_CPU);
2727
2728         /*
2729          * If pinned groups are involved, flexible groups also need to be
2730          * scheduled out.
2731          */
2732         if (event_type & EVENT_PINNED)
2733                 event_type |= EVENT_FLEXIBLE;
2734
2735         ctx_event_type = event_type & EVENT_ALL;
2736
2737         perf_pmu_disable(cpuctx->ctx.pmu);
2738         if (task_ctx)
2739                 task_ctx_sched_out(cpuctx, task_ctx, event_type);
2740
2741         /*
2742          * Decide which cpu ctx groups to schedule out based on the types
2743          * of events that caused rescheduling:
2744          *  - EVENT_CPU: schedule out corresponding groups;
2745          *  - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
2746          *  - otherwise, do nothing more.
2747          */
2748         if (cpu_event)
2749                 cpu_ctx_sched_out(cpuctx, ctx_event_type);
2750         else if (ctx_event_type & EVENT_PINNED)
2751                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2752
2753         perf_event_sched_in(cpuctx, task_ctx, current);
2754         perf_pmu_enable(cpuctx->ctx.pmu);
2755 }
2756
2757 void perf_pmu_resched(struct pmu *pmu)
2758 {
2759         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2760         struct perf_event_context *task_ctx = cpuctx->task_ctx;
2761
2762         perf_ctx_lock(cpuctx, task_ctx);
2763         ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
2764         perf_ctx_unlock(cpuctx, task_ctx);
2765 }
2766
2767 /*
2768  * Cross CPU call to install and enable a performance event
2769  *
2770  * Very similar to remote_function() + event_function() but cannot assume that
2771  * things like ctx->is_active and cpuctx->task_ctx are set.
2772  */
2773 static int  __perf_install_in_context(void *info)
2774 {
2775         struct perf_event *event = info;
2776         struct perf_event_context *ctx = event->ctx;
2777         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2778         struct perf_event_context *task_ctx = cpuctx->task_ctx;
2779         bool reprogram = true;
2780         int ret = 0;
2781
2782         raw_spin_lock(&cpuctx->ctx.lock);
2783         if (ctx->task) {
2784                 raw_spin_lock(&ctx->lock);
2785                 task_ctx = ctx;
2786
2787                 reprogram = (ctx->task == current);
2788
2789                 /*
2790                  * If the task is running, it must be running on this CPU,
2791                  * otherwise we cannot reprogram things.
2792                  *
2793                  * If it's not running, we don't care; ctx->lock will
2794                  * serialize against it becoming runnable.
2795                  */
2796                 if (task_curr(ctx->task) && !reprogram) {
2797                         ret = -ESRCH;
2798                         goto unlock;
2799                 }
2800
2801                 WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
2802         } else if (task_ctx) {
2803                 raw_spin_lock(&task_ctx->lock);
2804         }
2805
2806 #ifdef CONFIG_CGROUP_PERF
2807         if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) {
2808                 /*
2809                  * If the current cgroup doesn't match the event's
2810                  * cgroup, we should not try to schedule it.
2811                  */
2812                 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
2813                 reprogram = cgroup_is_descendant(cgrp->css.cgroup,
2814                                         event->cgrp->css.cgroup);
2815         }
2816 #endif
2817
2818         if (reprogram) {
2819                 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2820                 add_event_to_ctx(event, ctx);
2821                 ctx_resched(cpuctx, task_ctx, get_event_type(event));
2822         } else {
2823                 add_event_to_ctx(event, ctx);
2824         }
2825
2826 unlock:
2827         perf_ctx_unlock(cpuctx, task_ctx);
2828
2829         return ret;
2830 }
2831
2832 static bool exclusive_event_installable(struct perf_event *event,
2833                                         struct perf_event_context *ctx);
2834
2835 /*
2836  * Attach a performance event to a context.
2837  *
2838  * Very similar to event_function_call, see comment there.
2839  */
2840 static void
2841 perf_install_in_context(struct perf_event_context *ctx,
2842                         struct perf_event *event,
2843                         int cpu)
2844 {
2845         struct task_struct *task = READ_ONCE(ctx->task);
2846
2847         lockdep_assert_held(&ctx->mutex);
2848
2849         WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
2850
2851         if (event->cpu != -1)
2852                 event->cpu = cpu;
2853
2854         /*
2855          * Ensures that if we can observe event->ctx, both the event and ctx
2856          * will be 'complete'. See perf_iterate_sb_cpu().
2857          */
2858         smp_store_release(&event->ctx, ctx);
2859
2860         /*
2861          * perf_event_attr::disabled events will not run and can be initialized
2862          * without IPI. Except when this is the first event for the context, in
2863          * that case we need the magic of the IPI to set ctx->is_active.
2864          *
2865          * The IOC_ENABLE that is sure to follow the creation of a disabled
2866          * event will issue the IPI and reprogram the hardware.
2867          */
2868         if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) {
2869                 raw_spin_lock_irq(&ctx->lock);
2870                 if (ctx->task == TASK_TOMBSTONE) {
2871                         raw_spin_unlock_irq(&ctx->lock);
2872                         return;
2873                 }
2874                 add_event_to_ctx(event, ctx);
2875                 raw_spin_unlock_irq(&ctx->lock);
2876                 return;
2877         }
2878
2879         if (!task) {
2880                 cpu_function_call(cpu, __perf_install_in_context, event);
2881                 return;
2882         }
2883
2884         /*
2885          * Should not happen, we validate the ctx is still alive before calling.
2886          * Should not happen; we validate the ctx is still alive before calling.
2887         if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
2888                 return;
2889
2890         /*
2891          * Installing events is tricky because we cannot rely on ctx->is_active
2892          * to be set in case this is the nr_events 0 -> 1 transition.
2893          *
2894          * Instead we use task_curr(), which tells us if the task is running.
2895          * However, since we use task_curr() outside of rq::lock, we can race
2896          * against the actual state. This means the result can be wrong.
2897          *
2898          * If we get a false positive, we retry; this is harmless.
2899          *
2900          * If we get a false negative, things are complicated. If we are after
2901          * perf_event_context_sched_in(), ctx::lock will serialize us, and the
2902          * value must be correct. If we're before, it doesn't matter since
2903          * perf_event_context_sched_in() will program the counter.
2904          *
2905          * However, this hinges on the remote context switch having observed
2906          * our task->perf_event_ctxp[] store, such that it will in fact take
2907          * ctx::lock in perf_event_context_sched_in().
2908          *
2909          * We do this by task_function_call(); if the IPI fails to hit the task
2910          * we know any future context switch of the task must see the
2911          * perf_event_ctxp[] store.
2912          */
2913
2914         /*
2915          * This smp_mb() orders the task->perf_event_ctxp[] store with the
2916          * task_cpu() load, such that if the IPI then does not find the task
2917          * running, a future context switch of that task must observe the
2918          * store.
2919          */
2920         smp_mb();
2921 again:
2922         if (!task_function_call(task, __perf_install_in_context, event))
2923                 return;
2924
2925         raw_spin_lock_irq(&ctx->lock);
2926         task = ctx->task;
2927         if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
2928                 /*
2929                  * Cannot happen because we already checked above (which also
2930                  * cannot happen), and we hold ctx->mutex, which serializes us
2931                  * against perf_event_exit_task_context().
2932                  */
2933                 raw_spin_unlock_irq(&ctx->lock);
2934                 return;
2935         }
2936         /*
2937          * If the task is not running, ctx->lock will avoid it becoming so,
2938          * thus we can safely install the event.
2939          */
2940         if (task_curr(task)) {
2941                 raw_spin_unlock_irq(&ctx->lock);
2942                 goto again;
2943         }
2944         add_event_to_ctx(event, ctx);
2945         raw_spin_unlock_irq(&ctx->lock);
2946 }
2947
2948 /*
2949  * Cross CPU call to enable a performance event
2950  */
2951 static void __perf_event_enable(struct perf_event *event,
2952                                 struct perf_cpu_context *cpuctx,
2953                                 struct perf_event_context *ctx,
2954                                 void *info)
2955 {
2956         struct perf_event *leader = event->group_leader;
2957         struct perf_event_context *task_ctx;
2958
2959         if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2960             event->state <= PERF_EVENT_STATE_ERROR)
2961                 return;
2962
2963         if (ctx->is_active)
2964                 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2965
2966         perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2967         perf_cgroup_event_enable(event, ctx);
2968
2969         if (!ctx->is_active)
2970                 return;
2971
2972         if (!event_filter_match(event)) {
2973                 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2974                 return;
2975         }
2976
2977         /*
2978          * If the event is in a group and isn't the group leader,
2979          * then don't put it on unless the group is on.
2980          */
2981         if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
2982                 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2983                 return;
2984         }
2985
2986         task_ctx = cpuctx->task_ctx;
2987         if (ctx->task)
2988                 WARN_ON_ONCE(task_ctx != ctx);
2989
2990         ctx_resched(cpuctx, task_ctx, get_event_type(event));
2991 }
2992
2993 /*
2994  * Enable an event.
2995  *
2996  * If event->ctx is a cloned context, callers must make sure that
2997  * every task struct that event->ctx->task could possibly point to
2998  * remains valid.  This condition is satisfied when called through
2999  * perf_event_for_each_child or perf_event_for_each as described
3000  * for perf_event_disable.
3001  */
3002 static void _perf_event_enable(struct perf_event *event)
3003 {
3004         struct perf_event_context *ctx = event->ctx;
3005
3006         raw_spin_lock_irq(&ctx->lock);
3007         if (event->state >= PERF_EVENT_STATE_INACTIVE ||
3008             event->state <  PERF_EVENT_STATE_ERROR) {
3009 out:
3010                 raw_spin_unlock_irq(&ctx->lock);
3011                 return;
3012         }
3013
3014         /*
3015          * If the event is in error state, clear that first.
3016          *
3017          * That way, if we see the event in error state below, we know that it
3018          * has gone back into error state, as distinct from the task having
3019          * been scheduled away before the cross-call arrived.
3020          */
3021         if (event->state == PERF_EVENT_STATE_ERROR) {
3022                 /*
3023                  * Detached SIBLING events cannot leave ERROR state.
3024                  */
3025                 if (event->event_caps & PERF_EV_CAP_SIBLING &&
3026                     event->group_leader == event)
3027                         goto out;
3028
3029                 event->state = PERF_EVENT_STATE_OFF;
3030         }
3031         raw_spin_unlock_irq(&ctx->lock);
3032
3033         event_function_call(event, __perf_event_enable, NULL);
3034 }
3035
3036 /*
3037  * See perf_event_disable();
3038  */
3039 void perf_event_enable(struct perf_event *event)
3040 {
3041         struct perf_event_context *ctx;
3042
3043         ctx = perf_event_ctx_lock(event);
3044         _perf_event_enable(event);
3045         perf_event_ctx_unlock(event, ctx);
3046 }
3047 EXPORT_SYMBOL_GPL(perf_event_enable);
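/*
 * In-kernel usage sketch (illustration only; error handling abbreviated):
 * a kernel user that created a counter with perf_event_create_kernel_counter()
 * can bracket a region of interest with the exported enable/disable calls:
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.config		= PERF_COUNT_HW_CPU_CYCLES,
 *		.size		= sizeof(attr),
 *		.disabled	= 1,
 *	};
 *	struct perf_event *event;
 *	u64 enabled, running;
 *
 *	event = perf_event_create_kernel_counter(&attr, -1, current, NULL, NULL);
 *	if (IS_ERR(event))
 *		return PTR_ERR(event);
 *
 *	perf_event_enable(event);
 *	// ... region of interest ...
 *	perf_event_disable(event);
 *
 *	pr_info("cycles: %llu\n", perf_event_read_value(event, &enabled, &running));
 *	perf_event_release_kernel(event);
 */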
3048
3049 struct stop_event_data {
3050         struct perf_event       *event;
3051         unsigned int            restart;
3052 };
3053
3054 static int __perf_event_stop(void *info)
3055 {
3056         struct stop_event_data *sd = info;
3057         struct perf_event *event = sd->event;
3058
3059         /* if it's already INACTIVE, do nothing */
3060         if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
3061                 return 0;
3062
3063         /* matches smp_wmb() in event_sched_in() */
3064         smp_rmb();
3065
3066         /*
3067          * There is a window with interrupts enabled before we get here,
3068          * so we need to check again lest we try to stop another CPU's event.
3069          */
3070         if (READ_ONCE(event->oncpu) != smp_processor_id())
3071                 return -EAGAIN;
3072
3073         event->pmu->stop(event, PERF_EF_UPDATE);
3074
3075         /*
3076          * May race with the actual stop (through perf_pmu_output_stop()),
3077          * but it is only used for events with AUX ring buffer, and such
3078          * events will refuse to restart because of rb::aux_mmap_count==0,
3079          * see comments in perf_aux_output_begin().
3080          *
3081          * Since this is happening on an event-local CPU, no trace is lost
3082          * while restarting.
3083          */
3084         if (sd->restart)
3085                 event->pmu->start(event, 0);
3086
3087         return 0;
3088 }
3089
3090 static int perf_event_stop(struct perf_event *event, int restart)
3091 {
3092         struct stop_event_data sd = {
3093                 .event          = event,
3094                 .restart        = restart,
3095         };
3096         int ret = 0;
3097
3098         do {
3099                 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
3100                         return 0;
3101
3102                 /* matches smp_wmb() in event_sched_in() */
3103                 smp_rmb();
3104
3105                 /*
3106                  * We only want to restart ACTIVE events, so if the event goes
3107                  * inactive here (event->oncpu==-1), there's nothing more to do;
3108                  * fall through with ret==-ENXIO.
3109                  */
3110                 ret = cpu_function_call(READ_ONCE(event->oncpu),
3111                                         __perf_event_stop, &sd);
3112         } while (ret == -EAGAIN);
3113
3114         return ret;
3115 }
3116
3117 /*
3118  * In order to contain the amount of raciness and trickiness in the address
3119  * filter configuration management, it is a two-part process:
3120  *
3121  * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
3122  *      we update the addresses of corresponding vmas in
3123  *      event::addr_filter_ranges array and bump the event::addr_filters_gen;
3124  * (p2) when an event is scheduled in (pmu::add), it calls
3125  *      perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
3126  *      if the generation has changed since the previous call.
3127  *
3128  * If (p1) happens while the event is active, we restart it to force (p2).
3129  *
3130  * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
3131  *     pre-existing mappings, called once when new filters arrive via SET_FILTER
3132  *     ioctl;
3133  * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
3134  *     registered mapping, called for every new mmap(), with mm::mmap_lock down
3135  *     for reading;
3136  * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
3137  *     of exec.
3138  */
3139 void perf_event_addr_filters_sync(struct perf_event *event)
3140 {
3141         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
3142
3143         if (!has_addr_filter(event))
3144                 return;
3145
3146         raw_spin_lock(&ifh->lock);
3147         if (event->addr_filters_gen != event->hw.addr_filters_gen) {
3148                 event->pmu->addr_filters_sync(event);
3149                 event->hw.addr_filters_gen = event->addr_filters_gen;
3150         }
3151         raw_spin_unlock(&ifh->lock);
3152 }
3153 EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
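/*
 * Userspace sketch (illustration only; the path and addresses are made up):
 * address filters arrive via the SET_FILTER ioctl, which is what eventually
 * performs step (p1) above and, for an active event, the restart that forces
 * (p2):
 *
 *	// trace only 0x1000..0x1fff of the object mapped from /usr/bin/myprog
 *	const char *filter = "filter 0x1000/0x1000@/usr/bin/myprog";
 *
 *	if (ioctl(perf_fd, PERF_EVENT_IOC_SET_FILTER, filter))
 *		perror("PERF_EVENT_IOC_SET_FILTER");
 */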
3154
3155 static int _perf_event_refresh(struct perf_event *event, int refresh)
3156 {
3157         /*
3158          * not supported on inherited events
3159          */
3160         if (event->attr.inherit || !is_sampling_event(event))
3161                 return -EINVAL;
3162
3163         atomic_add(refresh, &event->event_limit);
3164         _perf_event_enable(event);
3165
3166         return 0;
3167 }
3168
3169 /*
3170  * See perf_event_disable()
3171  */
3172 int perf_event_refresh(struct perf_event *event, int refresh)
3173 {
3174         struct perf_event_context *ctx;
3175         int ret;
3176
3177         ctx = perf_event_ctx_lock(event);
3178         ret = _perf_event_refresh(event, refresh);
3179         perf_event_ctx_unlock(event, ctx);
3180
3181         return ret;
3182 }
3183 EXPORT_SYMBOL_GPL(perf_event_refresh);
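/*
 * Userspace sketch (illustration only): the usual self-monitoring pattern
 * re-arms a sampling event from its overflow signal handler; each REFRESH
 * allows one more overflow before the kernel disables the event again:
 *
 *	static void on_overflow(int sig, siginfo_t *si, void *uc)
 *	{
 *		// consume the sample(s), then allow one more overflow
 *		ioctl(si->si_fd, PERF_EVENT_IOC_REFRESH, 1);
 *	}
 *
 *	// setup (error handling elided):
 *	fcntl(perf_fd, F_SETFL, O_ASYNC);
 *	fcntl(perf_fd, F_SETSIG, SIGIO);
 *	fcntl(perf_fd, F_SETOWN, getpid());
 *	ioctl(perf_fd, PERF_EVENT_IOC_REFRESH, 1);
 */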
3184
3185 static int perf_event_modify_breakpoint(struct perf_event *bp,
3186                                          struct perf_event_attr *attr)
3187 {
3188         int err;
3189
3190         _perf_event_disable(bp);
3191
3192         err = modify_user_hw_breakpoint_check(bp, attr, true);
3193
3194         if (!bp->attr.disabled)
3195                 _perf_event_enable(bp);
3196
3197         return err;
3198 }
3199
3200 static int perf_event_modify_attr(struct perf_event *event,
3201                                   struct perf_event_attr *attr)
3202 {
3203         if (event->attr.type != attr->type)
3204                 return -EINVAL;
3205
3206         switch (event->attr.type) {
3207         case PERF_TYPE_BREAKPOINT:
3208                 return perf_event_modify_breakpoint(event, attr);
3209         default:
3210                 /* Placeholder for future additions. */
3211                 return -EOPNOTSUPP;
3212         }
3213 }
3214
3215 static void ctx_sched_out(struct perf_event_context *ctx,
3216                           struct perf_cpu_context *cpuctx,
3217                           enum event_type_t event_type)
3218 {
3219         struct perf_event *event, *tmp;
3220         int is_active = ctx->is_active;
3221
3222         lockdep_assert_held(&ctx->lock);
3223
3224         if (likely(!ctx->nr_events)) {
3225                 /*
3226                  * See __perf_remove_from_context().
3227                  */
3228                 WARN_ON_ONCE(ctx->is_active);
3229                 if (ctx->task)
3230                         WARN_ON_ONCE(cpuctx->task_ctx);
3231                 return;
3232         }
3233
3234         ctx->is_active &= ~event_type;
3235         if (!(ctx->is_active & EVENT_ALL))
3236                 ctx->is_active = 0;
3237
3238         if (ctx->task) {
3239                 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3240                 if (!ctx->is_active)
3241                         cpuctx->task_ctx = NULL;
3242         }
3243
3244         /*
3245          * Always update time if it was set; not only when it changes.
3246          * Otherwise we can 'forget' to update time for any but the last
3247          * context we sched out. For example:
3248          *
3249          *   ctx_sched_out(.event_type = EVENT_FLEXIBLE)
3250          *   ctx_sched_out(.event_type = EVENT_PINNED)
3251          *
3252          * would only update time for the pinned events.
3253          */
3254         if (is_active & EVENT_TIME) {
3255                 /* update (and stop) ctx time */
3256                 update_context_time(ctx);
3257                 update_cgrp_time_from_cpuctx(cpuctx);
3258         }
3259
3260         is_active ^= ctx->is_active; /* changed bits */
3261
3262         if (!ctx->nr_active || !(is_active & EVENT_ALL))
3263                 return;
3264
3265         perf_pmu_disable(ctx->pmu);
3266         if (is_active & EVENT_PINNED) {
3267                 list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
3268                         group_sched_out(event, cpuctx, ctx);
3269         }
3270
3271         if (is_active & EVENT_FLEXIBLE) {
3272                 list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
3273                         group_sched_out(event, cpuctx, ctx);
3274
3275                 /*
3276                  * Since we cleared EVENT_FLEXIBLE, also clear
3277                  * rotate_necessary, it will be reset by
3278                  * ctx_flexible_sched_in() when needed.
3279                  */
3280                 ctx->rotate_necessary = 0;
3281         }
3282         perf_pmu_enable(ctx->pmu);
3283 }
3284
3285 /*
3286  * Test whether two contexts are equivalent, i.e. whether they have both been
3287  * cloned from the same version of the same context.
3288  *
3289  * Equivalence is measured using a generation number in the context that is
3290  * incremented on each modification to it; see unclone_ctx(), list_add_event()
3291  * and list_del_event().
3292  */
3293 static int context_equiv(struct perf_event_context *ctx1,
3294                          struct perf_event_context *ctx2)
3295 {
3296         lockdep_assert_held(&ctx1->lock);
3297         lockdep_assert_held(&ctx2->lock);
3298
3299         /* Pinning disables the swap optimization */
3300         if (ctx1->pin_count || ctx2->pin_count)
3301                 return 0;
3302
3303         /* If ctx1 is the parent of ctx2 */
3304         if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
3305                 return 1;
3306
3307         /* If ctx2 is the parent of ctx1 */
3308         if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
3309                 return 1;
3310
3311         /*
3312          * If ctx1 and ctx2 have the same parent, we flatten the parent
3313          * hierarchy, see perf_event_init_context().
3314          */
3315         if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
3316                         ctx1->parent_gen == ctx2->parent_gen)
3317                 return 1;
3318
3319         /* Unmatched */
3320         return 0;
3321 }
3322
3323 static void __perf_event_sync_stat(struct perf_event *event,
3324                                      struct perf_event *next_event)
3325 {
3326         u64 value;
3327
3328         if (!event->attr.inherit_stat)
3329                 return;
3330
3331         /*
3332          * Update the event value; we cannot use perf_event_read()
3333          * because we're in the middle of a context switch and have IRQs
3334          * disabled, which upsets smp_call_function_single(). However,
3335          * we know the event must be on the current CPU, therefore we
3336          * don't need to use it.
3337          */
3338         if (event->state == PERF_EVENT_STATE_ACTIVE)
3339                 event->pmu->read(event);
3340
3341         perf_event_update_time(event);
3342
3343         /*
3344          * In order to keep per-task stats reliable we need to flip the event
3345          * values when we flip the contexts.
3346          */
3347         value = local64_read(&next_event->count);
3348         value = local64_xchg(&event->count, value);
3349         local64_set(&next_event->count, value);
3350
3351         swap(event->total_time_enabled, next_event->total_time_enabled);
3352         swap(event->total_time_running, next_event->total_time_running);
3353
3354         /*
3355          * Since we swizzled the values, update the user visible data too.
3356          */
3357         perf_event_update_userpage(event);
3358         perf_event_update_userpage(next_event);
3359 }
3360
3361 static void perf_event_sync_stat(struct perf_event_context *ctx,
3362                                    struct perf_event_context *next_ctx)
3363 {
3364         struct perf_event *event, *next_event;
3365
3366         if (!ctx->nr_stat)
3367                 return;
3368
3369         update_context_time(ctx);
3370
3371         event = list_first_entry(&ctx->event_list,
3372                                    struct perf_event, event_entry);
3373
3374         next_event = list_first_entry(&next_ctx->event_list,
3375                                         struct perf_event, event_entry);
3376
3377         while (&event->event_entry != &ctx->event_list &&
3378                &next_event->event_entry != &next_ctx->event_list) {
3379
3380                 __perf_event_sync_stat(event, next_event);
3381
3382                 event = list_next_entry(event, event_entry);
3383                 next_event = list_next_entry(next_event, event_entry);
3384         }
3385 }
3386
3387 static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
3388                                          struct task_struct *next)
3389 {
3390         struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
3391         struct perf_event_context *next_ctx;
3392         struct perf_event_context *parent, *next_parent;
3393         struct perf_cpu_context *cpuctx;
3394         int do_switch = 1;
3395         struct pmu *pmu;
3396
3397         if (likely(!ctx))
3398                 return;
3399
3400         pmu = ctx->pmu;
3401         cpuctx = __get_cpu_context(ctx);
3402         if (!cpuctx->task_ctx)
3403                 return;
3404
3405         rcu_read_lock();
3406         next_ctx = next->perf_event_ctxp[ctxn];
3407         if (!next_ctx)
3408                 goto unlock;
3409
3410         parent = rcu_dereference(ctx->parent_ctx);
3411         next_parent = rcu_dereference(next_ctx->parent_ctx);
3412
3413         /* If neither context has a parent context, they cannot be clones. */
3414         if (!parent && !next_parent)
3415                 goto unlock;
3416
3417         if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
3418                 /*
3419                  * Looks like the two contexts are clones, so we might be
3420                  * able to optimize the context switch.  We lock both
3421                  * contexts and check that they are clones under the
3422                  * lock (including re-checking that neither has been
3423                  * uncloned in the meantime).  It doesn't matter which
3424                  * order we take the locks because no other cpu could
3425                  * be trying to lock both of these tasks.
3426                  */
3427                 raw_spin_lock(&ctx->lock);
3428                 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
3429                 if (context_equiv(ctx, next_ctx)) {
3430
3431                         WRITE_ONCE(ctx->task, next);
3432                         WRITE_ONCE(next_ctx->task, task);
3433
3434                         perf_pmu_disable(pmu);
3435
3436                         if (cpuctx->sched_cb_usage && pmu->sched_task)
3437                                 pmu->sched_task(ctx, false);
3438
3439                         /*
3440                          * PMU specific parts of task perf context can require
3441                          * additional synchronization. As an example of such
3442                          * synchronization see implementation details of Intel
3443                          * synchronization, see the implementation details of Intel
3444                          * LBR call stack data profiling.
3445                         if (pmu->swap_task_ctx)
3446                                 pmu->swap_task_ctx(ctx, next_ctx);
3447                         else
3448                                 swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
3449
3450                         perf_pmu_enable(pmu);
3451
3452                         /*
3453                          * RCU_INIT_POINTER here is safe because we've not
3454                          * modified the ctx and the above modifications of
3455                          * ctx->task and ctx->task_ctx_data are immaterial
3456                          * since those values are always verified under
3457                          * ctx->lock which we're now holding.
3458                          */
3459                         RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
3460                         RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
3461
3462                         do_switch = 0;
3463
3464                         perf_event_sync_stat(ctx, next_ctx);
3465                 }
3466                 raw_spin_unlock(&next_ctx->lock);
3467                 raw_spin_unlock(&ctx->lock);
3468         }
3469 unlock:
3470         rcu_read_unlock();
3471
3472         if (do_switch) {
3473                 raw_spin_lock(&ctx->lock);
3474                 perf_pmu_disable(pmu);
3475
3476                 if (cpuctx->sched_cb_usage && pmu->sched_task)
3477                         pmu->sched_task(ctx, false);
3478                 task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
3479
3480                 perf_pmu_enable(pmu);
3481                 raw_spin_unlock(&ctx->lock);
3482         }
3483 }
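/*
 * Worked example of the optimization above (illustrative): after a fork()
 * with inherited counters, parent and child carry clones of the same
 * context. When the scheduler switches directly between the two tasks,
 * rather than scheduling out the outgoing events and re-installing
 * identical ones, the code above merely swaps the tasks' context pointers
 * (and any task_ctx_data) while the PMU keeps counting.
 */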
3484
3485 static DEFINE_PER_CPU(struct list_head, sched_cb_list);
3486
3487 void perf_sched_cb_dec(struct pmu *pmu)
3488 {
3489         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3490
3491         this_cpu_dec(perf_sched_cb_usages);
3492
3493         if (!--cpuctx->sched_cb_usage)
3494                 list_del(&cpuctx->sched_cb_entry);
3495 }
3496
3497
3498 void perf_sched_cb_inc(struct pmu *pmu)
3499 {
3500         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3501
3502         if (!cpuctx->sched_cb_usage++)
3503                 list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
3504
3505         this_cpu_inc(perf_sched_cb_usages);
3506 }
3507
3508 /*
3509  * This function provides the context switch callback to the lower code
3510  * layer. It is invoked ONLY when the context switch callback is enabled.
3511  *
3512  * This callback is relevant even to per-cpu events; for example multi event
3513  * PEBS requires this to provide PID/TID information. This requires we flush
3514  * all queued PEBS records before we context switch to a new task.
3515  */
3516 static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
3517 {
3518         struct pmu *pmu;
3519
3520         pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
3521
3522         if (WARN_ON_ONCE(!pmu->sched_task))
3523                 return;
3524
3525         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3526         perf_pmu_disable(pmu);
3527
3528         pmu->sched_task(cpuctx->task_ctx, sched_in);
3529
3530         perf_pmu_enable(pmu);
3531         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3532 }
3533
3534 static void perf_pmu_sched_task(struct task_struct *prev,
3535                                 struct task_struct *next,
3536                                 bool sched_in)
3537 {
3538         struct perf_cpu_context *cpuctx;
3539
3540         if (prev == next)
3541                 return;
3542
3543         list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
3544                 /* will be handled in perf_event_context_sched_in/out */
3545                 if (cpuctx->task_ctx)
3546                         continue;
3547
3548                 __perf_pmu_sched_task(cpuctx, sched_in);
3549         }
3550 }
3551
3552 static void perf_event_switch(struct task_struct *task,
3553                               struct task_struct *next_prev, bool sched_in);
3554
3555 #define for_each_task_context_nr(ctxn)                                  \
3556         for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
3557
3558 /*
3559  * Called from scheduler to remove the events of the current task,
3560  * with interrupts disabled.
3561  *
3562  * We stop each event and update the event value in event->count.
3563  *
3564  * This does not protect us against NMI, but disable()
3565  * sets the disabled bit in the control field of event _before_
3566  * accessing the event control register. If an NMI hits, then it will
3567  * not restart the event.
3568  */
3569 void __perf_event_task_sched_out(struct task_struct *task,
3570                                  struct task_struct *next)
3571 {
3572         int ctxn;
3573
3574         if (__this_cpu_read(perf_sched_cb_usages))
3575                 perf_pmu_sched_task(task, next, false);
3576
3577         if (atomic_read(&nr_switch_events))
3578                 perf_event_switch(task, next, false);
3579
3580         for_each_task_context_nr(ctxn)
3581                 perf_event_context_sched_out(task, ctxn, next);
3582
3583         /*
3584          * If cgroup events exist on this CPU, then we need
3585          * to check if we have to switch out PMU state.
3586          * cgroup events are system-wide mode only.
3587          */
3588         if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3589                 perf_cgroup_sched_out(task, next);
3590 }
3591
3592 /*
3593  * Called with IRQs disabled
3594  */
3595 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
3596                               enum event_type_t event_type)
3597 {
3598         ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
3599 }
3600
3601 static bool perf_less_group_idx(const void *l, const void *r)
3602 {
3603         const struct perf_event *le = *(const struct perf_event **)l;
3604         const struct perf_event *re = *(const struct perf_event **)r;
3605
3606         return le->group_index < re->group_index;
3607 }
3608
3609 static void swap_ptr(void *l, void *r)
3610 {
3611         void **lp = l, **rp = r;
3612
3613         swap(*lp, *rp);
3614 }
3615
3616 static const struct min_heap_callbacks perf_min_heap = {
3617         .elem_size = sizeof(struct perf_event *),
3618         .less = perf_less_group_idx,
3619         .swp = swap_ptr,
3620 };
3621
3622 static void __heap_add(struct min_heap *heap, struct perf_event *event)
3623 {
3624         struct perf_event **itrs = heap->data;
3625
3626         if (event) {
3627                 itrs[heap->nr] = event;
3628                 heap->nr++;
3629         }
3630 }
3631
3632 static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
3633                                 struct perf_event_groups *groups, int cpu,
3634                                 int (*func)(struct perf_event *, void *),
3635                                 void *data)
3636 {
3637 #ifdef CONFIG_CGROUP_PERF
3638         struct cgroup_subsys_state *css = NULL;
3639 #endif
3640         /* Space for per CPU and/or any CPU event iterators. */
3641         struct perf_event *itrs[2];
3642         struct min_heap event_heap;
3643         struct perf_event **evt;
3644         int ret;
3645
3646         if (cpuctx) {
3647                 event_heap = (struct min_heap){
3648                         .data = cpuctx->heap,
3649                         .nr = 0,
3650                         .size = cpuctx->heap_size,
3651                 };
3652
3653                 lockdep_assert_held(&cpuctx->ctx.lock);
3654
3655 #ifdef CONFIG_CGROUP_PERF
3656                 if (cpuctx->cgrp)
3657                         css = &cpuctx->cgrp->css;
3658 #endif
3659         } else {
3660                 event_heap = (struct min_heap){
3661                         .data = itrs,
3662                         .nr = 0,
3663                         .size = ARRAY_SIZE(itrs),
3664                 };
3665                 /* Events not within a CPU context may be on any CPU. */
3666                 __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));
3667         }
3668         evt = event_heap.data;
3669
3670         __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));
3671
3672 #ifdef CONFIG_CGROUP_PERF
3673         for (; css; css = css->parent)
3674                 __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));
3675 #endif
3676
3677         min_heapify_all(&event_heap, &perf_min_heap);
3678
3679         while (event_heap.nr) {
3680                 ret = func(*evt, data);
3681                 if (ret)
3682                         return ret;
3683
3684                 *evt = perf_event_groups_next(*evt);
3685                 if (*evt)
3686                         min_heapify(&event_heap, 0, &perf_min_heap);
3687                 else
3688                         min_heap_pop(&event_heap, &perf_min_heap);
3689         }
3690
3691         return 0;
3692 }
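/*
 * Worked example of the merge above (illustrative group_index values): if
 * the CPU-bound tree currently holds group leaders with group_index {3, 9}
 * and the any-CPU tree holds {5}, the min-heap yields them as 3, 5, 9, so
 * @func visits the groups in ascending group_index order even though they
 * live in separate sub-trees.
 */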
3693
3694 static int merge_sched_in(struct perf_event *event, void *data)
3695 {
3696         struct perf_event_context *ctx = event->ctx;
3697         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3698         int *can_add_hw = data;
3699
3700         if (event->state <= PERF_EVENT_STATE_OFF)
3701                 return 0;
3702
3703         if (!event_filter_match(event))
3704                 return 0;
3705
3706         if (group_can_go_on(event, cpuctx, *can_add_hw)) {
3707                 if (!group_sched_in(event, cpuctx, ctx))
3708                         list_add_tail(&event->active_list, get_event_list(event));
3709         }
3710
3711         if (event->state == PERF_EVENT_STATE_INACTIVE) {
3712                 if (event->attr.pinned) {
3713                         perf_cgroup_event_disable(event, ctx);
3714                         perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
3715                 }
3716
3717                 *can_add_hw = 0;
3718                 ctx->rotate_necessary = 1;
3719                 perf_mux_hrtimer_restart(cpuctx);
3720         }
3721
3722         return 0;
3723 }
3724
3725 static void
3726 ctx_pinned_sched_in(struct perf_event_context *ctx,
3727                     struct perf_cpu_context *cpuctx)
3728 {
3729         int can_add_hw = 1;
3730
3731         if (ctx != &cpuctx->ctx)
3732                 cpuctx = NULL;
3733
3734         visit_groups_merge(cpuctx, &ctx->pinned_groups,
3735                            smp_processor_id(),
3736                            merge_sched_in, &can_add_hw);
3737 }
3738
3739 static void
3740 ctx_flexible_sched_in(struct perf_event_context *ctx,
3741                       struct perf_cpu_context *cpuctx)
3742 {
3743         int can_add_hw = 1;
3744
3745         if (ctx != &cpuctx->ctx)
3746                 cpuctx = NULL;
3747
3748         visit_groups_merge(cpuctx, &ctx->flexible_groups,
3749                            smp_processor_id(),
3750                            merge_sched_in, &can_add_hw);
3751 }
3752
3753 static void
3754 ctx_sched_in(struct perf_event_context *ctx,
3755              struct perf_cpu_context *cpuctx,
3756              enum event_type_t event_type,
3757              struct task_struct *task)
3758 {
3759         int is_active = ctx->is_active;
3760         u64 now;
3761
3762         lockdep_assert_held(&ctx->lock);
3763
3764         if (likely(!ctx->nr_events))
3765                 return;
3766
3767         ctx->is_active |= (event_type | EVENT_TIME);
3768         if (ctx->task) {
3769                 if (!is_active)
3770                         cpuctx->task_ctx = ctx;
3771                 else
3772                         WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3773         }
3774
3775         is_active ^= ctx->is_active; /* changed bits */
3776
3777         if (is_active & EVENT_TIME) {
3778                 /* start ctx time */
3779                 now = perf_clock();
3780                 ctx->timestamp = now;
3781                 perf_cgroup_set_timestamp(task, ctx);
3782         }
3783
3784         /*
3785          * First go through the list and put on any pinned groups
3786          * in order to give them the best chance of going on.
3787          */
3788         if (is_active & EVENT_PINNED)
3789                 ctx_pinned_sched_in(ctx, cpuctx);
3790
3791         /* Then walk through the lower prio flexible groups */
3792         if (is_active & EVENT_FLEXIBLE)
3793                 ctx_flexible_sched_in(ctx, cpuctx);
3794 }
3795
3796 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
3797                              enum event_type_t event_type,
3798                              struct task_struct *task)
3799 {
3800         struct perf_event_context *ctx = &cpuctx->ctx;
3801
3802         ctx_sched_in(ctx, cpuctx, event_type, task);
3803 }
3804
3805 static void perf_event_context_sched_in(struct perf_event_context *ctx,
3806                                         struct task_struct *task)
3807 {
3808         struct perf_cpu_context *cpuctx;
3809         struct pmu *pmu = ctx->pmu;
3810
3811         cpuctx = __get_cpu_context(ctx);
3812         if (cpuctx->task_ctx == ctx) {
3813                 if (cpuctx->sched_cb_usage)
3814                         __perf_pmu_sched_task(cpuctx, true);
3815                 return;
3816         }
3817
3818         perf_ctx_lock(cpuctx, ctx);
3819         /*
3820          * We must check ctx->nr_events while holding ctx->lock, such
3821          * that we serialize against perf_install_in_context().
3822          */
3823         if (!ctx->nr_events)
3824                 goto unlock;
3825
3826         perf_pmu_disable(pmu);
3827         /*
3828          * We want to keep the following priority order:
3829          * cpu pinned (that don't need to move), task pinned,
3830          * cpu flexible, task flexible.
3831          *
3832          * However, if the task's ctx is not carrying any pinned
3833          * events, there is no need to flip the cpuctx's events around.
3834          */
3835         if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
3836                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3837         perf_event_sched_in(cpuctx, ctx, task);
3838
3839         if (cpuctx->sched_cb_usage && pmu->sched_task)
3840                 pmu->sched_task(cpuctx->task_ctx, true);
3841
3842         perf_pmu_enable(pmu);
3843
3844 unlock:
3845         perf_ctx_unlock(cpuctx, ctx);
3846 }
3847
3848 /*
3849  * Called from scheduler to add the events of the current task
3850  * with interrupts disabled.
3851  *
3852  * We restore the event value and then enable it.
3853  *
3854  * This does not protect us against NMI, but enable()
3855  * sets the enabled bit in the control field of event _before_
3856  * accessing the event control register. If an NMI hits, then it will
3857  * keep the event running.
3858  */
3859 void __perf_event_task_sched_in(struct task_struct *prev,
3860                                 struct task_struct *task)
3861 {
3862         struct perf_event_context *ctx;
3863         int ctxn;
3864
3865         /*
3866          * If cgroup events exist on this CPU, then we need to check if we have
3867          * to switch in PMU state; cgroup events are system-wide mode only.
3868          *
3869          * Since cgroup events are CPU events, we must schedule these in before
3870          * we schedule in the task events.
3871          */
3872         if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3873                 perf_cgroup_sched_in(prev, task);
3874
3875         for_each_task_context_nr(ctxn) {
3876                 ctx = task->perf_event_ctxp[ctxn];
3877                 if (likely(!ctx))
3878                         continue;
3879
3880                 perf_event_context_sched_in(ctx, task);
3881         }
3882
3883         if (atomic_read(&nr_switch_events))
3884                 perf_event_switch(task, prev, true);
3885
3886         if (__this_cpu_read(perf_sched_cb_usages))
3887                 perf_pmu_sched_task(prev, task, true);
3888 }
3889
3890 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
3891 {
3892         u64 frequency = event->attr.sample_freq;
3893         u64 sec = NSEC_PER_SEC;
3894         u64 divisor, dividend;
3895
3896         int count_fls, nsec_fls, frequency_fls, sec_fls;
3897
3898         count_fls = fls64(count);
3899         nsec_fls = fls64(nsec);
3900         frequency_fls = fls64(frequency);
3901         sec_fls = 30;
3902
3903         /*
3904          * We got @count in @nsec; with a target of sample_freq HZ,
3905          * the target period becomes:
3906          *
3907          *             @count * 10^9
3908          * period = -------------------
3909          *          @nsec * sample_freq
3910          *
3911          */
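	/*
	 * Worked example (illustrative numbers): with sample_freq = 1000 Hz
	 * and count = 2,000,000 events counted over nsec = 10,000,000 ns:
	 *
	 *   period = 2e6 * 1e9 / (1e7 * 1e3) = 200,000 events/sample
	 *
	 * i.e. the event fires at 2e8 events/sec, so sampling every 200,000
	 * events yields the requested ~1000 samples/sec.
	 */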
3912
3913         /*
3914          * Reduce accuracy by one bit such that @a and @b converge
3915          * to a similar magnitude.
3916          */
3917 #define REDUCE_FLS(a, b)                \
3918 do {                                    \
3919         if (a##_fls > b##_fls) {        \
3920                 a >>= 1;                \
3921                 a##_fls--;              \
3922         } else {                        \
3923                 b >>= 1;                \
3924                 b##_fls--;              \
3925         }                               \
3926 } while (0)
3927
3928         /*
3929          * Reduce accuracy until either term fits in a u64, then proceed with
3930          * the other, so that finally we can do a u64/u64 division.
3931          */
3932         while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
3933                 REDUCE_FLS(nsec, frequency);
3934                 REDUCE_FLS(sec, count);
3935         }
3936
3937         if (count_fls + sec_fls > 64) {
3938                 divisor = nsec * frequency;
3939
3940                 while (count_fls + sec_fls > 64) {
3941                         REDUCE_FLS(count, sec);
3942                         divisor >>= 1;
3943                 }
3944
3945                 dividend = count * sec;
3946         } else {
3947                 dividend = count * sec;
3948
3949                 while (nsec_fls + frequency_fls > 64) {
3950                         REDUCE_FLS(nsec, frequency);
3951                         dividend >>= 1;
3952                 }
3953
3954                 divisor = nsec * frequency;
3955         }
3956
3957         if (!divisor)
3958                 return dividend;
3959
3960         return div64_u64(dividend, divisor);
3961 }
3962
3963 static DEFINE_PER_CPU(int, perf_throttled_count);
3964 static DEFINE_PER_CPU(u64, perf_throttled_seq);
3965
3966 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
3967 {
3968         struct hw_perf_event *hwc = &event->hw;
3969         s64 period, sample_period;
3970         s64 delta;
3971
3972         period = perf_calculate_period(event, nsec, count);
3973
3974         delta = (s64)(period - hwc->sample_period);
3975         delta = (delta + 7) / 8; /* low pass filter */
3976
3977         sample_period = hwc->sample_period + delta;
3978
3979         if (!sample_period)
3980                 sample_period = 1;
3981
3982         hwc->sample_period = sample_period;
3983
3984         if (local64_read(&hwc->period_left) > 8*sample_period) {
3985                 if (disable)
3986                         event->pmu->stop(event, PERF_EF_UPDATE);
3987
3988                 local64_set(&hwc->period_left, 0);
3989
3990                 if (disable)
3991                         event->pmu->start(event, PERF_EF_RELOAD);
3992         }
3993 }
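/*
 * Worked example of the adjustment above (illustrative numbers): if
 * hwc->sample_period is 100,000 and perf_calculate_period() returns
 * 200,000, then delta = 100,000 and the low-pass filter applies only
 * (100,000 + 7) / 8 = 12,500 of it, giving a new sample_period of 112,500.
 * Successive ticks converge on the target period instead of jumping to it,
 * which damps oscillation when the event rate is noisy.
 */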
3994
3995 /*
3996  * Combine freq adjustment with unthrottling to avoid two passes over the
3997  * events. At the same time, make sure that having freq events does not
3998  * change the rate of unthrottling, as that would introduce bias.
3999  */
4000 static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
4001                                            int needs_unthr)
4002 {
4003         struct perf_event *event;
4004         struct hw_perf_event *hwc;
4005         u64 now, period = TICK_NSEC;
4006         s64 delta;
4007
4008         /*
4009          * We only need to iterate over all events if:
4010          * - the context has events in frequency mode (needs freq adjust), or
4011          * - there are events to unthrottle on this CPU.
4012          */
4013         if (!(ctx->nr_freq || needs_unthr))
4014                 return;
4015
4016         raw_spin_lock(&ctx->lock);
4017         perf_pmu_disable(ctx->pmu);
4018
4019         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4020                 if (event->state != PERF_EVENT_STATE_ACTIVE)
4021                         continue;
4022
4023                 if (!event_filter_match(event))
4024                         continue;
4025
4026                 perf_pmu_disable(event->pmu);
4027
4028                 hwc = &event->hw;
4029
4030                 if (hwc->interrupts == MAX_INTERRUPTS) {
4031                         hwc->interrupts = 0;
4032                         perf_log_throttle(event, 1);
4033                         event->pmu->start(event, 0);
4034                 }
4035
4036                 if (!event->attr.freq || !event->attr.sample_freq)
4037                         goto next;
4038
4039                 /*
4040                  * stop the event and update event->count
4041                  */
4042                 event->pmu->stop(event, PERF_EF_UPDATE);
4043
4044                 now = local64_read(&event->count);
4045                 delta = now - hwc->freq_count_stamp;
4046                 hwc->freq_count_stamp = now;
4047
4048                 /*
4049                  * Restart the event, reloading only if the value has
4050                  * changed.
4051                  * We have already stopped the event, so tell
4052                  * perf_adjust_period() not to stop it a
4053                  * second time.
4054                  */
4055                 if (delta > 0)
4056                         perf_adjust_period(event, period, delta, false);
4057
4058                 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
4059         next:
4060                 perf_pmu_enable(event->pmu);
4061         }
4062
4063         perf_pmu_enable(ctx->pmu);
4064         raw_spin_unlock(&ctx->lock);
4065 }
4066
4067 /*
4068  * Move @event to the tail of @ctx's eligible events.
4069  */
4070 static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
4071 {
4072         /*
4073          * Rotate the first entry of the non-pinned groups to the end. Rotation
4074          * might be disabled by the inheritance code.
4075          */
4076         if (ctx->rotate_disable)
4077                 return;
4078
4079         perf_event_groups_delete(&ctx->flexible_groups, event);
4080         perf_event_groups_insert(&ctx->flexible_groups, event);
4081 }
4082
4083 /* pick an event from the flexible_groups to rotate */
4084 static inline struct perf_event *
4085 ctx_event_to_rotate(struct perf_event_context *ctx)
4086 {
4087         struct perf_event *event;
4088
4089         /* pick the first active flexible event */
4090         event = list_first_entry_or_null(&ctx->flexible_active,
4091                                          struct perf_event, active_list);
4092
4093         /* if no active flexible event, pick the first event */
4094         if (!event) {
4095                 event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
4096                                       typeof(*event), group_node);
4097         }
4098
4099         /*
4100          * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
4101          * finds there are unschedulable events, it will set it again.
4102          */
4103         ctx->rotate_necessary = 0;
4104
4105         return event;
4106 }
4107
4108 static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
4109 {
4110         struct perf_event *cpu_event = NULL, *task_event = NULL;
4111         struct perf_event_context *task_ctx = NULL;
4112         int cpu_rotate, task_rotate;
4113
4114         /*
4115          * Since we run this from IRQ context, nobody can install new
4116          * events, thus the event count values are stable.
4117          */
4118
4119         cpu_rotate = cpuctx->ctx.rotate_necessary;
4120         task_ctx = cpuctx->task_ctx;
4121         task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
4122
4123         if (!(cpu_rotate || task_rotate))
4124                 return false;
4125
4126         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
4127         perf_pmu_disable(cpuctx->ctx.pmu);
4128
4129         if (task_rotate)
4130                 task_event = ctx_event_to_rotate(task_ctx);
4131         if (cpu_rotate)
4132                 cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
4133
4134         /*
4135          * As per the order given at ctx_resched(), first 'pop' the task
4136          * flexible events and then, if needed, the CPU flexible events.
4137          */
4138         if (task_event || (task_ctx && cpu_event))
4139                 ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
4140         if (cpu_event)
4141                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
4142
4143         if (task_event)
4144                 rotate_ctx(task_ctx, task_event);
4145         if (cpu_event)
4146                 rotate_ctx(&cpuctx->ctx, cpu_event);
4147
4148         perf_event_sched_in(cpuctx, task_ctx, current);
4149
4150         perf_pmu_enable(cpuctx->ctx.pmu);
4151         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
4152
4153         return true;
4154 }
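/*
 * Worked example of the rotation above (illustrative): with three flexible
 * events A, B and C but only two free counters, a tick schedules {A, B} and
 * leaves C unscheduled, which sets rotate_necessary. The next rotation
 * moves A to the tail of the group tree, so the following tick schedules
 * {B, C}; over successive ticks every event gets counter time
 * (multiplexing).
 */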
4155
4156 void perf_event_task_tick(void)
4157 {
4158         struct list_head *head = this_cpu_ptr(&active_ctx_list);
4159         struct perf_event_context *ctx, *tmp;
4160         int throttled;
4161
4162         lockdep_assert_irqs_disabled();
4163
4164         __this_cpu_inc(perf_throttled_seq);
4165         throttled = __this_cpu_xchg(perf_throttled_count, 0);
4166         tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
4167
4168         list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
4169                 perf_adjust_freq_unthr_context(ctx, throttled);
4170 }
4171
4172 static int event_enable_on_exec(struct perf_event *event,
4173                                 struct perf_event_context *ctx)
4174 {
4175         if (!event->attr.enable_on_exec)
4176                 return 0;
4177
4178         event->attr.enable_on_exec = 0;
4179         if (event->state >= PERF_EVENT_STATE_INACTIVE)
4180                 return 0;
4181
4182         perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
4183
4184         return 1;
4185 }
4186
4187 /*
4188  * Enable all of a task's events that have been marked enable-on-exec.
4189  * This expects task == current.
4190  */
4191 static void perf_event_enable_on_exec(int ctxn)
4192 {
4193         struct perf_event_context *ctx, *clone_ctx = NULL;
4194         enum event_type_t event_type = 0;
4195         struct perf_cpu_context *cpuctx;
4196         struct perf_event *event;
4197         unsigned long flags;
4198         int enabled = 0;
4199
4200         local_irq_save(flags);
4201         ctx = current->perf_event_ctxp[ctxn];
4202         if (!ctx || !ctx->nr_events)
4203                 goto out;
4204
4205         cpuctx = __get_cpu_context(ctx);
4206         perf_ctx_lock(cpuctx, ctx);
4207         ctx_sched_out(ctx, cpuctx, EVENT_TIME);
4208         list_for_each_entry(event, &ctx->event_list, event_entry) {
4209                 enabled |= event_enable_on_exec(event, ctx);
4210                 event_type |= get_event_type(event);
4211         }
4212
4213         /*
4214          * Unclone and reschedule this context if we enabled any event.
4215          */
4216         if (enabled) {
4217                 clone_ctx = unclone_ctx(ctx);
4218                 ctx_resched(cpuctx, ctx, event_type);
4219         } else {
4220                 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
4221         }
4222         perf_ctx_unlock(cpuctx, ctx);
4223
4224 out:
4225         local_irq_restore(flags);
4226
4227         if (clone_ctx)
4228                 put_ctx(clone_ctx);
4229 }
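/*
 * Illustrative sketch (not kernel code): the typical user of enable_on_exec
 * is a tool that wants counting to start exactly when a child it launches
 * calls exec(). The attribute values below are made up for the example.
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.config		= PERF_COUNT_HW_INSTRUCTIONS,
 *		.disabled	= 1,	// stays off in the tool itself
 *		.enable_on_exec	= 1,	// armed when the child exec()s
 *	};
 *	// int fd = perf_event_open(&attr, child_pid, -1, -1, 0);
 *	// ...the child exec()s and the code above enables the event.
 */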
4230
4231 struct perf_read_data {
4232         struct perf_event *event;
4233         bool group;
4234         int ret;
4235 };
4236
4237 static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
4238 {
4239         u16 local_pkg, event_pkg;
4240
4241         if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
4242                 int local_cpu = smp_processor_id();
4243
4244                 event_pkg = topology_physical_package_id(event_cpu);
4245                 local_pkg = topology_physical_package_id(local_cpu);
4246
4247                 if (event_pkg == local_pkg)
4248                         return local_cpu;
4249         }
4250
4251         return event_cpu;
4252 }
4253
4254 /*
4255  * Cross CPU call to read the hardware event
4256  */
4257 static void __perf_event_read(void *info)
4258 {
4259         struct perf_read_data *data = info;
4260         struct perf_event *sub, *event = data->event;
4261         struct perf_event_context *ctx = event->ctx;
4262         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
4263         struct pmu *pmu = event->pmu;
4264
4265         /*
4266          * If this is a task context, we need to check whether it is
4267          * the current task context of this CPU.  If not, it has been
4268          * scheduled out before the smp call arrived.  In that case
4269          * event->count would have been updated to a recent sample
4270          * when the event was scheduled out.
4271          */
4272         if (ctx->task && cpuctx->task_ctx != ctx)
4273                 return;
4274
4275         raw_spin_lock(&ctx->lock);
4276         if (ctx->is_active & EVENT_TIME) {
4277                 update_context_time(ctx);
4278                 update_cgrp_time_from_event(event);
4279         }
4280
4281         perf_event_update_time(event);
4282         if (data->group)
4283                 perf_event_update_sibling_time(event);
4284
4285         if (event->state != PERF_EVENT_STATE_ACTIVE)
4286                 goto unlock;
4287
4288         if (!data->group) {
4289                 pmu->read(event);
4290                 data->ret = 0;
4291                 goto unlock;
4292         }
4293
4294         pmu->start_txn(pmu, PERF_PMU_TXN_READ);
4295
4296         pmu->read(event);
4297
4298         for_each_sibling_event(sub, event) {
4299                 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
4300                         /*
4301                          * Use the sibling's PMU rather than @event's, since
4302                          * the sibling could be on a different (e.g. software) PMU.
4303                          */
4304                         sub->pmu->read(sub);
4305                 }
4306         }
4307
4308         data->ret = pmu->commit_txn(pmu);
4309
4310 unlock:
4311         raw_spin_unlock(&ctx->lock);
4312 }
4313
4314 static inline u64 perf_event_count(struct perf_event *event)
4315 {
4316         return local64_read(&event->count) + atomic64_read(&event->child_count);
4317 }
4318
4319 /*
4320  * NMI-safe method to read a local event, that is an event
4321  * that:
4322  *   - is either for the current task, or for this CPU;
4323  *   - does not have inherit set, because inherited task events
4324  *     will not be local and we cannot read them atomically;
4325  *   - does not have a pmu::count method.
4326  */
4327 int perf_event_read_local(struct perf_event *event, u64 *value,
4328                           u64 *enabled, u64 *running)
4329 {
4330         unsigned long flags;
4331         int ret = 0;
4332
4333         /*
4334          * Disabling interrupts avoids all counter scheduling (context
4335          * switches, timer based rotation and IPIs).
4336          */
4337         local_irq_save(flags);
4338
4339         /*
4340          * It must not be an event with inherit set; we cannot read
4341          * all child counters from atomic context.
4342          */
4343         if (event->attr.inherit) {
4344                 ret = -EOPNOTSUPP;
4345                 goto out;
4346         }
4347
4348         /* If this is a per-task event, it must be for current */
4349         if ((event->attach_state & PERF_ATTACH_TASK) &&
4350             event->hw.target != current) {
4351                 ret = -EINVAL;
4352                 goto out;
4353         }
4354
4355         /* If this is a per-CPU event, it must be for this CPU */
4356         if (!(event->attach_state & PERF_ATTACH_TASK) &&
4357             event->cpu != smp_processor_id()) {
4358                 ret = -EINVAL;
4359                 goto out;
4360         }
4361
4362         /* If this is a pinned event it must be running on this CPU */
4363         if (event->attr.pinned && event->oncpu != smp_processor_id()) {
4364                 ret = -EBUSY;
4365                 goto out;
4366         }
4367
4368         /*
4369          * If the event is currently on this CPU, it's either a per-task event,
4370          * or local to this CPU. Furthermore, it means it's ACTIVE (otherwise
4371          * oncpu == -1).
4372          */
4373         if (event->oncpu == smp_processor_id())
4374                 event->pmu->read(event);
4375
4376         *value = local64_read(&event->count);
4377         if (enabled || running) {
4378                 u64 now = event->shadow_ctx_time + perf_clock();
4379                 u64 __enabled, __running;
4380
4381                 __perf_update_times(event, now, &__enabled, &__running);
4382                 if (enabled)
4383                         *enabled = __enabled;
4384                 if (running)
4385                         *running = __running;
4386         }
4387 out:
4388         local_irq_restore(flags);
4389
4390         return ret;
4391 }
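/*
 * Illustrative sketch (not part of the file): a hypothetical in-kernel
 * caller that owns a suitable event (current task or this CPU, no inherit)
 * and wants a cheap, IPI-free read, e.g. from a tracing hook:
 *
 *	u64 count, enabled, running;
 *
 *	if (!perf_event_read_local(event, &count, &enabled, &running))
 *		pr_debug("count=%llu enabled=%llu running=%llu\n",
 *			 count, enabled, running);
 */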
4392
4393 static int perf_event_read(struct perf_event *event, bool group)
4394 {
4395         enum perf_event_state state = READ_ONCE(event->state);
4396         int event_cpu, ret = 0;
4397
4398         /*
4399          * If event is enabled and currently active on a CPU, update the
4400          * value in the event structure:
4401          */
4402 again:
4403         if (state == PERF_EVENT_STATE_ACTIVE) {
4404                 struct perf_read_data data;
4405
4406                 /*
4407                  * Orders the ->state and ->oncpu loads such that if we see
4408                  * ACTIVE we must also see the right ->oncpu.
4409                  *
4410                  * Matches the smp_wmb() from event_sched_in().
4411                  */
4412                 smp_rmb();
4413
4414                 event_cpu = READ_ONCE(event->oncpu);
4415                 if ((unsigned)event_cpu >= nr_cpu_ids)
4416                         return 0;
4417
4418                 data = (struct perf_read_data){
4419                         .event = event,
4420                         .group = group,
4421                         .ret = 0,
4422                 };
4423
4424                 preempt_disable();
4425                 event_cpu = __perf_event_read_cpu(event, event_cpu);
4426
4427                 /*
4428                  * Purposely ignore the smp_call_function_single() return
4429                  * value.
4430                  *
4431                  * If event_cpu isn't a valid CPU it means the event got
4432                  * scheduled out and that will have updated the event count.
4433                  *
4434                  * Therefore, either way, we'll have an up-to-date event count
4435                  * after this.
4436                  */
4437                 (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
4438                 preempt_enable();
4439                 ret = data.ret;
4440
4441         } else if (state == PERF_EVENT_STATE_INACTIVE) {
4442                 struct perf_event_context *ctx = event->ctx;
4443                 unsigned long flags;
4444
4445                 raw_spin_lock_irqsave(&ctx->lock, flags);
4446                 state = event->state;
4447                 if (state != PERF_EVENT_STATE_INACTIVE) {
4448                         raw_spin_unlock_irqrestore(&ctx->lock, flags);
4449                         goto again;
4450                 }
4451
4452                 /*
4453                  * May read while context is not active (e.g., thread is
4454                  * blocked); in that case we cannot update context time.
4455                  */
4456                 if (ctx->is_active & EVENT_TIME) {
4457                         update_context_time(ctx);
4458                         update_cgrp_time_from_event(event);
4459                 }
4460
4461                 perf_event_update_time(event);
4462                 if (group)
4463                         perf_event_update_sibling_time(event);
4464                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4465         }
4466
4467         return ret;
4468 }
4469
4470 /*
4471  * Initialize the perf_event context in a task_struct:
4472  */
4473 static void __perf_event_init_context(struct perf_event_context *ctx)
4474 {
4475         raw_spin_lock_init(&ctx->lock);
4476         mutex_init(&ctx->mutex);
4477         INIT_LIST_HEAD(&ctx->active_ctx_list);
4478         perf_event_groups_init(&ctx->pinned_groups);
4479         perf_event_groups_init(&ctx->flexible_groups);
4480         INIT_LIST_HEAD(&ctx->event_list);
4481         INIT_LIST_HEAD(&ctx->pinned_active);
4482         INIT_LIST_HEAD(&ctx->flexible_active);
4483         refcount_set(&ctx->refcount, 1);
4484 }
4485
4486 static struct perf_event_context *
4487 alloc_perf_context(struct pmu *pmu, struct task_struct *task)
4488 {
4489         struct perf_event_context *ctx;
4490
4491         ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
4492         if (!ctx)
4493                 return NULL;
4494
4495         __perf_event_init_context(ctx);
4496         if (task)
4497                 ctx->task = get_task_struct(task);
4498         ctx->pmu = pmu;
4499
4500         return ctx;
4501 }
4502
4503 static struct task_struct *
4504 find_lively_task_by_vpid(pid_t vpid)
4505 {
4506         struct task_struct *task;
4507
4508         rcu_read_lock();
4509         if (!vpid)
4510                 task = current;
4511         else
4512                 task = find_task_by_vpid(vpid);
4513         if (task)
4514                 get_task_struct(task);
4515         rcu_read_unlock();
4516
4517         if (!task)
4518                 return ERR_PTR(-ESRCH);
4519
4520         return task;
4521 }
4522
4523 /*
4524  * Returns a matching context with refcount and pincount.
4525  */
4526 static struct perf_event_context *
4527 find_get_context(struct pmu *pmu, struct task_struct *task,
4528                 struct perf_event *event)
4529 {
4530         struct perf_event_context *ctx, *clone_ctx = NULL;
4531         struct perf_cpu_context *cpuctx;
4532         void *task_ctx_data = NULL;
4533         unsigned long flags;
4534         int ctxn, err;
4535         int cpu = event->cpu;
4536
4537         if (!task) {
4538                 /* Must be root to operate on a CPU event: */
4539                 err = perf_allow_cpu(&event->attr);
4540                 if (err)
4541                         return ERR_PTR(err);
4542
4543                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
4544                 ctx = &cpuctx->ctx;
4545                 get_ctx(ctx);
4546                 ++ctx->pin_count;
4547
4548                 return ctx;
4549         }
4550
4551         err = -EINVAL;
4552         ctxn = pmu->task_ctx_nr;
4553         if (ctxn < 0)
4554                 goto errout;
4555
4556         if (event->attach_state & PERF_ATTACH_TASK_DATA) {
4557                 task_ctx_data = alloc_task_ctx_data(pmu);
4558                 if (!task_ctx_data) {
4559                         err = -ENOMEM;
4560                         goto errout;
4561                 }
4562         }
4563
4564 retry:
4565         ctx = perf_lock_task_context(task, ctxn, &flags);
4566         if (ctx) {
4567                 clone_ctx = unclone_ctx(ctx);
4568                 ++ctx->pin_count;
4569
4570                 if (task_ctx_data && !ctx->task_ctx_data) {
4571                         ctx->task_ctx_data = task_ctx_data;
4572                         task_ctx_data = NULL;
4573                 }
4574                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4575
4576                 if (clone_ctx)
4577                         put_ctx(clone_ctx);
4578         } else {
4579                 ctx = alloc_perf_context(pmu, task);
4580                 err = -ENOMEM;
4581                 if (!ctx)
4582                         goto errout;
4583
4584                 if (task_ctx_data) {
4585                         ctx->task_ctx_data = task_ctx_data;
4586                         task_ctx_data = NULL;
4587                 }
4588
4589                 err = 0;
4590                 mutex_lock(&task->perf_event_mutex);
4591                 /*
4592                  * If it has already passed perf_event_exit_task(),
4593                  * we must see PF_EXITING; it takes this mutex too.
4594                  */
4595                 if (task->flags & PF_EXITING)
4596                         err = -ESRCH;
4597                 else if (task->perf_event_ctxp[ctxn])
4598                         err = -EAGAIN;
4599                 else {
4600                         get_ctx(ctx);
4601                         ++ctx->pin_count;
4602                         rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
4603                 }
4604                 mutex_unlock(&task->perf_event_mutex);
4605
4606                 if (unlikely(err)) {
4607                         put_ctx(ctx);
4608
4609                         if (err == -EAGAIN)
4610                                 goto retry;
4611                         goto errout;
4612                 }
4613         }
4614
4615         free_task_ctx_data(pmu, task_ctx_data);
4616         return ctx;
4617
4618 errout:
4619         free_task_ctx_data(pmu, task_ctx_data);
4620         return ERR_PTR(err);
4621 }
4622
4623 static void perf_event_free_filter(struct perf_event *event);
4624 static void perf_event_free_bpf_prog(struct perf_event *event);
4625
4626 static void free_event_rcu(struct rcu_head *head)
4627 {
4628         struct perf_event *event;
4629
4630         event = container_of(head, struct perf_event, rcu_head);
4631         if (event->ns)
4632                 put_pid_ns(event->ns);
4633         perf_event_free_filter(event);
4634         kmem_cache_free(perf_event_cache, event);
4635 }
4636
4637 static void ring_buffer_attach(struct perf_event *event,
4638                                struct perf_buffer *rb);
4639
4640 static void detach_sb_event(struct perf_event *event)
4641 {
4642         struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
4643
4644         raw_spin_lock(&pel->lock);
4645         list_del_rcu(&event->sb_list);
4646         raw_spin_unlock(&pel->lock);
4647 }
4648
4649 static bool is_sb_event(struct perf_event *event)
4650 {
4651         struct perf_event_attr *attr = &event->attr;
4652
4653         if (event->parent)
4654                 return false;
4655
4656         if (event->attach_state & PERF_ATTACH_TASK)
4657                 return false;
4658
4659         if (attr->mmap || attr->mmap_data || attr->mmap2 ||
4660             attr->comm || attr->comm_exec ||
4661             attr->task || attr->ksymbol ||
4662             attr->context_switch || attr->text_poke ||
4663             attr->bpf_event)
4664                 return true;
4665         return false;
4666 }
4667
4668 static void unaccount_pmu_sb_event(struct perf_event *event)
4669 {
4670         if (is_sb_event(event))
4671                 detach_sb_event(event);
4672 }
4673
4674 static void unaccount_event_cpu(struct perf_event *event, int cpu)
4675 {
4676         if (event->parent)
4677                 return;
4678
4679         if (is_cgroup_event(event))
4680                 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
4681 }
4682
4683 #ifdef CONFIG_NO_HZ_FULL
4684 static DEFINE_SPINLOCK(nr_freq_lock);
4685 #endif
4686
4687 static void unaccount_freq_event_nohz(void)
4688 {
4689 #ifdef CONFIG_NO_HZ_FULL
4690         spin_lock(&nr_freq_lock);
4691         if (atomic_dec_and_test(&nr_freq_events))
4692                 tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
4693         spin_unlock(&nr_freq_lock);
4694 #endif
4695 }
4696
4697 static void unaccount_freq_event(void)
4698 {
4699         if (tick_nohz_full_enabled())
4700                 unaccount_freq_event_nohz();
4701         else
4702                 atomic_dec(&nr_freq_events);
4703 }
4704
4705 static void unaccount_event(struct perf_event *event)
4706 {
4707         bool dec = false;
4708
4709         if (event->parent)
4710                 return;
4711
4712         if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
4713                 dec = true;
4714         if (event->attr.mmap || event->attr.mmap_data)
4715                 atomic_dec(&nr_mmap_events);
4716         if (event->attr.build_id)
4717                 atomic_dec(&nr_build_id_events);
4718         if (event->attr.comm)
4719                 atomic_dec(&nr_comm_events);
4720         if (event->attr.namespaces)
4721                 atomic_dec(&nr_namespaces_events);
4722         if (event->attr.cgroup)
4723                 atomic_dec(&nr_cgroup_events);
4724         if (event->attr.task)
4725                 atomic_dec(&nr_task_events);
4726         if (event->attr.freq)
4727                 unaccount_freq_event();
4728         if (event->attr.context_switch) {
4729                 dec = true;
4730                 atomic_dec(&nr_switch_events);
4731         }
4732         if (is_cgroup_event(event))
4733                 dec = true;
4734         if (has_branch_stack(event))
4735                 dec = true;
4736         if (event->attr.ksymbol)
4737                 atomic_dec(&nr_ksymbol_events);
4738         if (event->attr.bpf_event)
4739                 atomic_dec(&nr_bpf_events);
4740         if (event->attr.text_poke)
4741                 atomic_dec(&nr_text_poke_events);
4742
4743         if (dec) {
4744                 if (!atomic_add_unless(&perf_sched_count, -1, 1))
4745                         schedule_delayed_work(&perf_sched_work, HZ);
4746         }
4747
4748         unaccount_event_cpu(event, event->cpu);
4749
4750         unaccount_pmu_sb_event(event);
4751 }
4752
4753 static void perf_sched_delayed(struct work_struct *work)
4754 {
4755         mutex_lock(&perf_sched_mutex);
4756         if (atomic_dec_and_test(&perf_sched_count))
4757                 static_branch_disable(&perf_sched_events);
4758         mutex_unlock(&perf_sched_mutex);
4759 }
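
/*
 * Note on perf_sched_count in unaccount_event() above:
 * atomic_add_unless(.., -1, 1) refuses the decrement only when the count
 * is 1, i.e. when we would be the last user. The final decrement and the
 * static_branch_disable() are then deferred to perf_sched_delayed(), which
 * runs in process context under perf_sched_mutex roughly a second (HZ)
 * later, keeping the static key flip out of the event teardown path itself.
 */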
4760
4761 /*
4762  * The following implement mutual exclusion of events on "exclusive" pmus
4763  * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
4764  * at a time, so we disallow creating events that might conflict, namely:
4765  *
4766  *  1) cpu-wide events in the presence of per-task events,
4767  *  2) per-task events in the presence of cpu-wide events,
4768  *  3) two matching events on the same context.
4769  *
4770  * The former two cases are handled in the allocation path (perf_event_alloc(),
4771  * _free_event()), the latter -- before the first perf_install_in_context().
4772  */
4773 static int exclusive_event_init(struct perf_event *event)
4774 {
4775         struct pmu *pmu = event->pmu;
4776
4777         if (!is_exclusive_pmu(pmu))
4778                 return 0;
4779
4780         /*
4781          * Prevent co-existence of per-task and cpu-wide events on the
4782          * same exclusive pmu.
4783          *
4784          * Negative pmu::exclusive_cnt means there are cpu-wide
4785          * events on this "exclusive" pmu, positive means there are
4786          * per-task events.
4787          *
4788          * Since this is called in perf_event_alloc() path, event::ctx
4789          * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
4790          * to mean "per-task event", because unlike other attach states it
4791          * never gets cleared.
4792          */
4793         if (event->attach_state & PERF_ATTACH_TASK) {
4794                 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
4795                         return -EBUSY;
4796         } else {
4797                 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
4798                         return -EBUSY;
4799         }
4800
4801         return 0;
4802 }
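
/*
 * For illustration: two per-task events on the same exclusive pmu drive
 * pmu::exclusive_cnt to +2; a cpu-wide event created next would have to
 * take the count negative via atomic_dec_unless_positive(), which fails,
 * so it gets -EBUSY. The mirror image holds when cpu-wide events come first.
 */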
4803
4804 static void exclusive_event_destroy(struct perf_event *event)
4805 {
4806         struct pmu *pmu = event->pmu;
4807
4808         if (!is_exclusive_pmu(pmu))
4809                 return;
4810
4811         /* see comment in exclusive_event_init() */
4812         if (event->attach_state & PERF_ATTACH_TASK)
4813                 atomic_dec(&pmu->exclusive_cnt);
4814         else
4815                 atomic_inc(&pmu->exclusive_cnt);
4816 }
4817
4818 static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
4819 {
4820         if ((e1->pmu == e2->pmu) &&
4821             (e1->cpu == e2->cpu ||
4822              e1->cpu == -1 ||
4823              e2->cpu == -1))
4824                 return true;
4825         return false;
4826 }
4827
4828 static bool exclusive_event_installable(struct perf_event *event,
4829                                         struct perf_event_context *ctx)
4830 {
4831         struct perf_event *iter_event;
4832         struct pmu *pmu = event->pmu;
4833
4834         lockdep_assert_held(&ctx->mutex);
4835
4836         if (!is_exclusive_pmu(pmu))
4837                 return true;
4838
4839         list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
4840                 if (exclusive_event_match(iter_event, event))
4841                         return false;
4842         }
4843
4844         return true;
4845 }
4846
4847 static void perf_addr_filters_splice(struct perf_event *event,
4848                                        struct list_head *head);
4849
4850 static void _free_event(struct perf_event *event)
4851 {
4852         irq_work_sync(&event->pending);
4853
4854         unaccount_event(event);
4855
4856         security_perf_event_free(event);
4857
4858         if (event->rb) {
4859                 /*
4860                  * Can happen when we close an event with re-directed output.
4861                  *
4862                  * Since we have a 0 refcount, perf_mmap_close() will skip
4863                  * over us; possibly making our ring_buffer_put() the last.
4864                  */
4865                 mutex_lock(&event->mmap_mutex);
4866                 ring_buffer_attach(event, NULL);
4867                 mutex_unlock(&event->mmap_mutex);
4868         }
4869
4870         if (is_cgroup_event(event))
4871                 perf_detach_cgroup(event);
4872
4873         if (!event->parent) {
4874                 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
4875                         put_callchain_buffers();
4876         }
4877
4878         perf_event_free_bpf_prog(event);
4879         perf_addr_filters_splice(event, NULL);
4880         kfree(event->addr_filter_ranges);
4881
4882         if (event->destroy)
4883                 event->destroy(event);
4884
4885         /*
4886          * Must be after ->destroy(), due to uprobe_perf_close() using
4887          * hw.target.
4888          */
4889         if (event->hw.target)
4890                 put_task_struct(event->hw.target);
4891
4892         /*
4893          * perf_event_free_task() relies on put_ctx() being 'last', in particular
4894          * all task references must be cleaned up.
4895          */
4896         if (event->ctx)
4897                 put_ctx(event->ctx);
4898
4899         exclusive_event_destroy(event);
4900         module_put(event->pmu->module);
4901
4902         call_rcu(&event->rcu_head, free_event_rcu);
4903 }
4904
4905 /*
4906  * Used to free events which have a known refcount of 1, such as in error paths
4907  * where the event isn't exposed yet, and for inherited events.
4908  */
4909 static void free_event(struct perf_event *event)
4910 {
4911         if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
4912                                 "unexpected event refcount: %ld; ptr=%p\n",
4913                                 atomic_long_read(&event->refcount), event)) {
4914                 /* leak to avoid use-after-free */
4915                 return;
4916         }
4917
4918         _free_event(event);
4919 }
4920
4921 /*
4922  * Remove user event from the owner task.
4923  */
4924 static void perf_remove_from_owner(struct perf_event *event)
4925 {
4926         struct task_struct *owner;
4927
4928         rcu_read_lock();
4929         /*
4930          * Matches the smp_store_release() in perf_event_exit_task(). If we
4931          * observe !owner it means the list deletion is complete and we can
4932          * indeed free this event, otherwise we need to serialize on
4933          * owner->perf_event_mutex.
4934          */
4935         owner = READ_ONCE(event->owner);
4936         if (owner) {
4937                 /*
4938                  * Since delayed_put_task_struct() also drops the last
4939                  * task reference we can safely take a new reference
4940                  * while holding the rcu_read_lock().
4941                  */
4942                 get_task_struct(owner);
4943         }
4944         rcu_read_unlock();
4945
4946         if (owner) {
4947                 /*
4948                  * If we're here through perf_event_exit_task() we're already
4949                  * holding ctx->mutex which would be an inversion wrt. the
4950                  * normal lock order.
4951                  *
4952                  * However we can safely take this lock because it's the child
4953                  * ctx->mutex.
4954                  */
4955                 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
4956
4957                 /*
4958                  * We have to re-check the event->owner field; if it is cleared
4959                  * we raced with perf_event_exit_task(). Acquiring the mutex
4960                  * ensured they're done, and we can proceed with freeing the
4961                  * event.
4962                  */
4963                 if (event->owner) {
4964                         list_del_init(&event->owner_entry);
4965                         smp_store_release(&event->owner, NULL);
4966                 }
4967                 mutex_unlock(&owner->perf_event_mutex);
4968                 put_task_struct(owner);
4969         }
4970 }
4971
4972 static void put_event(struct perf_event *event)
4973 {
4974         if (!atomic_long_dec_and_test(&event->refcount))
4975                 return;
4976
4977         _free_event(event);
4978 }
4979
4980 /*
4981  * Kill an event dead; while event:refcount will preserve the event
4982  * object, it will not preserve its functionality. Once the last 'user'
4983  * gives up the object, we'll destroy the thing.
4984  */
4985 int perf_event_release_kernel(struct perf_event *event)
4986 {
4987         struct perf_event_context *ctx = event->ctx;
4988         struct perf_event *child, *tmp;
4989         LIST_HEAD(free_list);
4990
4991         /*
4992          * If we got here through err_file: fput(event_file); we will not have
4993          * attached to a context yet.
4994          */
4995         if (!ctx) {
4996                 WARN_ON_ONCE(event->attach_state &
4997                                 (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
4998                 goto no_ctx;
4999         }
5000
5001         if (!is_kernel_event(event))
5002                 perf_remove_from_owner(event);
5003
5004         ctx = perf_event_ctx_lock(event);
5005         WARN_ON_ONCE(ctx->parent_ctx);
5006         perf_remove_from_context(event, DETACH_GROUP);
5007
5008         raw_spin_lock_irq(&ctx->lock);
5009         /*
5010          * Mark this event as STATE_DEAD, there is no external reference to it
5011          * anymore.
5012          *
5013          * Anybody acquiring event->child_mutex after the below loop _must_
5014          * also see this, most importantly inherit_event() which will avoid
5015          * placing more children on the list.
5016          *
5017          * Thus this guarantees that we will in fact observe and kill _ALL_
5018          * child events.
5019          */
5020         event->state = PERF_EVENT_STATE_DEAD;
5021         raw_spin_unlock_irq(&ctx->lock);
5022
5023         perf_event_ctx_unlock(event, ctx);
5024
5025 again:
5026         mutex_lock(&event->child_mutex);
5027         list_for_each_entry(child, &event->child_list, child_list) {
5028
5029                 /*
5030                  * Cannot change, child events are not migrated, see the
5031                  * comment with perf_event_ctx_lock_nested().
5032                  */
5033                 ctx = READ_ONCE(child->ctx);
5034                 /*
5035                  * Since child_mutex nests inside ctx::mutex, we must jump
5036                  * through hoops. We start by grabbing a reference on the ctx.
5037                  *
5038                  * Since the event cannot get freed while we hold the
5039                  * child_mutex, the context must also exist and have a !0
5040                  * reference count.
5041                  */
5042                 get_ctx(ctx);
5043
5044                 /*
5045                  * Now that we have a ctx ref, we can drop child_mutex, and
5046                  * acquire ctx::mutex without fear of it going away. Then we
5047                  * can re-acquire child_mutex.
5048                  */
5049                 mutex_unlock(&event->child_mutex);
5050                 mutex_lock(&ctx->mutex);
5051                 mutex_lock(&event->child_mutex);
5052
5053                 /*
5054                  * Now that we hold ctx::mutex and child_mutex, revalidate our
5055                  * state; if child is still the first entry, it didn't get freed
5056                  * and we can continue.
5057                  */
5058                 tmp = list_first_entry_or_null(&event->child_list,
5059                                                struct perf_event, child_list);
5060                 if (tmp == child) {
5061                         perf_remove_from_context(child, DETACH_GROUP);
5062                         list_move(&child->child_list, &free_list);
5063                         /*
5064                          * This matches the refcount bump in inherit_event();
5065                          * this can't be the last reference.
5066                          */
5067                         put_event(event);
5068                 }
5069
5070                 mutex_unlock(&event->child_mutex);
5071                 mutex_unlock(&ctx->mutex);
5072                 put_ctx(ctx);
5073                 goto again;
5074         }
5075         mutex_unlock(&event->child_mutex);
5076
5077         list_for_each_entry_safe(child, tmp, &free_list, child_list) {
5078                 void *var = &child->ctx->refcount;
5079
5080                 list_del(&child->child_list);
5081                 free_event(child);
5082
5083                 /*
5084                  * Wake any perf_event_free_task() waiting for this event to be
5085                  * freed.
5086                  */
5087                 smp_mb(); /* pairs with wait_var_event() */
5088                 wake_up_var(var);
5089         }
5090
5091 no_ctx:
5092         put_event(event); /* Must be the 'last' reference */
5093         return 0;
5094 }
5095 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
5096
5097 /*
5098  * Called when the last reference to the file is gone.
5099  */
5100 static int perf_release(struct inode *inode, struct file *file)
5101 {
5102         perf_event_release_kernel(file->private_data);
5103         return 0;
5104 }
5105
5106 static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
5107 {
5108         struct perf_event *child;
5109         u64 total = 0;
5110
5111         *enabled = 0;
5112         *running = 0;
5113
5114         mutex_lock(&event->child_mutex);
5115
5116         (void)perf_event_read(event, false);
5117         total += perf_event_count(event);
5118
5119         *enabled += event->total_time_enabled +
5120                         atomic64_read(&event->child_total_time_enabled);
5121         *running += event->total_time_running +
5122                         atomic64_read(&event->child_total_time_running);
5123
5124         list_for_each_entry(child, &event->child_list, child_list) {
5125                 (void)perf_event_read(child, false);
5126                 total += perf_event_count(child);
5127                 *enabled += child->total_time_enabled;
5128                 *running += child->total_time_running;
5129         }
5130         mutex_unlock(&event->child_mutex);
5131
5132         return total;
5133 }
5134
5135 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
5136 {
5137         struct perf_event_context *ctx;
5138         u64 count;
5139
5140         ctx = perf_event_ctx_lock(event);
5141         count = __perf_event_read_value(event, enabled, running);
5142         perf_event_ctx_unlock(event, ctx);
5143
5144         return count;
5145 }
5146 EXPORT_SYMBOL_GPL(perf_event_read_value);
5147
5148 static int __perf_read_group_add(struct perf_event *leader,
5149                                         u64 read_format, u64 *values)
5150 {
5151         struct perf_event_context *ctx = leader->ctx;
5152         struct perf_event *sub;
5153         unsigned long flags;
5154         int n = 1; /* skip @nr */
5155         int ret;
5156
5157         ret = perf_event_read(leader, true);
5158         if (ret)
5159                 return ret;
5160
5161         raw_spin_lock_irqsave(&ctx->lock, flags);
5162
5163         /*
5164          * Since we co-schedule groups, {enabled,running} times of siblings
5165          * will be identical to those of the leader, so we only publish one
5166          * set.
5167          */
5168         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
5169                 values[n++] += leader->total_time_enabled +
5170                         atomic64_read(&leader->child_total_time_enabled);
5171         }
5172
5173         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
5174                 values[n++] += leader->total_time_running +
5175                         atomic64_read(&leader->child_total_time_running);
5176         }
5177
5178         /*
5179          * Write {count,id} tuples for every sibling.
5180          */
5181         values[n++] += perf_event_count(leader);
5182         if (read_format & PERF_FORMAT_ID)
5183                 values[n++] = primary_event_id(leader);
5184
5185         for_each_sibling_event(sub, leader) {
5186                 values[n++] += perf_event_count(sub);
5187                 if (read_format & PERF_FORMAT_ID)
5188                         values[n++] = primary_event_id(sub);
5189         }
5190
5191         raw_spin_unlock_irqrestore(&ctx->lock, flags);
5192         return 0;
5193 }
5194
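/*
 * The values[] buffer follows the PERF_FORMAT_GROUP read layout
 * (see include/uapi/linux/perf_event.h):
 *
 *   { u64 nr;
 *     { u64 time_enabled; }   && PERF_FORMAT_TOTAL_TIME_ENABLED
 *     { u64 time_running; }   && PERF_FORMAT_TOTAL_TIME_RUNNING
 *     { u64 value;
 *       { u64 id; }           && PERF_FORMAT_ID
 *     } cntr[nr];
 *   }
 *
 * __perf_read_group_add() above accumulates (+=) the counts and times into
 * this layout, which is why perf_read_group() below can call it for the
 * leader and then for each inherited child over the same values[] array to
 * sum everything up.
 */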
5195 static int perf_read_group(struct perf_event *event,
5196                                    u64 read_format, char __user *buf)
5197 {
5198         struct perf_event *leader = event->group_leader, *child;
5199         struct perf_event_context *ctx = leader->ctx;
5200         int ret;
5201         u64 *values;
5202
5203         lockdep_assert_held(&ctx->mutex);
5204
5205         values = kzalloc(event->read_size, GFP_KERNEL);
5206         if (!values)
5207                 return -ENOMEM;
5208
5209         values[0] = 1 + leader->nr_siblings;
5210
5211         /*
5212          * By locking the child_mutex of the leader we effectively
5213          * lock the child list of all siblings. XXX: explain how.
5214          */
5215         mutex_lock(&leader->child_mutex);
5216
5217         ret = __perf_read_group_add(leader, read_format, values);
5218         if (ret)
5219                 goto unlock;
5220
5221         list_for_each_entry(child, &leader->child_list, child_list) {
5222                 ret = __perf_read_group_add(child, read_format, values);
5223                 if (ret)
5224                         goto unlock;
5225         }
5226
5227         mutex_unlock(&leader->child_mutex);
5228
5229         ret = event->read_size;
5230         if (copy_to_user(buf, values, event->read_size))
5231                 ret = -EFAULT;
5232         goto out;
5233
5234 unlock:
5235         mutex_unlock(&leader->child_mutex);
5236 out:
5237         kfree(values);
5238         return ret;
5239 }
5240
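/*
 * Non-group read layout: value first, then time_enabled, time_running and id,
 * each only when the corresponding PERF_FORMAT_* bit is set in
 * attr.read_format -- at most four u64s, hence values[4] below.
 */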
5241 static int perf_read_one(struct perf_event *event,
5242                                  u64 read_format, char __user *buf)
5243 {
5244         u64 enabled, running;
5245         u64 values[4];
5246         int n = 0;
5247
5248         values[n++] = __perf_event_read_value(event, &enabled, &running);
5249         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
5250                 values[n++] = enabled;
5251         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
5252                 values[n++] = running;
5253         if (read_format & PERF_FORMAT_ID)
5254                 values[n++] = primary_event_id(event);
5255
5256         if (copy_to_user(buf, values, n * sizeof(u64)))
5257                 return -EFAULT;
5258
5259         return n * sizeof(u64);
5260 }
5261
5262 static bool is_event_hup(struct perf_event *event)
5263 {
5264         bool no_children;
5265
5266         if (event->state > PERF_EVENT_STATE_EXIT)
5267                 return false;
5268
5269         mutex_lock(&event->child_mutex);
5270         no_children = list_empty(&event->child_list);
5271         mutex_unlock(&event->child_mutex);
5272         return no_children;
5273 }
5274
5275 /*
5276  * Read the performance event - simple non-blocking version for now
5277  */
5278 static ssize_t
5279 __perf_read(struct perf_event *event, char __user *buf, size_t count)
5280 {
5281         u64 read_format = event->attr.read_format;
5282         int ret;
5283
5284         /*
5285          * Return end-of-file for a read on an event that is in
5286          * error state (i.e. because it was pinned but it couldn't be
5287          * scheduled on to the CPU at some point).
5288          */
5289         if (event->state == PERF_EVENT_STATE_ERROR)
5290                 return 0;
5291
5292         if (count < event->read_size)
5293                 return -ENOSPC;
5294
5295         WARN_ON_ONCE(event->ctx->parent_ctx);
5296         if (read_format & PERF_FORMAT_GROUP)
5297                 ret = perf_read_group(event, read_format, buf);
5298         else
5299                 ret = perf_read_one(event, read_format, buf);
5300
5301         return ret;
5302 }
5303
5304 static ssize_t
5305 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
5306 {
5307         struct perf_event *event = file->private_data;
5308         struct perf_event_context *ctx;
5309         int ret;
5310
5311         ret = security_perf_event_read(event);
5312         if (ret)
5313                 return ret;
5314
5315         ctx = perf_event_ctx_lock(event);
5316         ret = __perf_read(event, buf, count);
5317         perf_event_ctx_unlock(event, ctx);
5318
5319         return ret;
5320 }
5321
5322 static __poll_t perf_poll(struct file *file, poll_table *wait)
5323 {
5324         struct perf_event *event = file->private_data;
5325         struct perf_buffer *rb;
5326         __poll_t events = EPOLLHUP;
5327
5328         poll_wait(file, &event->waitq, wait);
5329
5330         if (is_event_hup(event))
5331                 return events;
5332
5333         /*
5334          * Pin the event->rb by taking event->mmap_mutex; otherwise
5335          * perf_event_set_output() can swizzle our rb and make us miss wakeups.
5336          */
5337         mutex_lock(&event->mmap_mutex);
5338         rb = event->rb;
5339         if (rb)
5340                 events = atomic_xchg(&rb->poll, 0);
5341         mutex_unlock(&event->mmap_mutex);
5342         return events;
5343 }
5344
5345 static void _perf_event_reset(struct perf_event *event)
5346 {
5347         (void)perf_event_read(event, false);
5348         local64_set(&event->count, 0);
5349         perf_event_update_userpage(event);
5350 }
5351
5352 /* Assume it's not an event with inherit set. */
5353 u64 perf_event_pause(struct perf_event *event, bool reset)
5354 {
5355         struct perf_event_context *ctx;
5356         u64 count;
5357
5358         ctx = perf_event_ctx_lock(event);
5359         WARN_ON_ONCE(event->attr.inherit);
5360         _perf_event_disable(event);
5361         count = local64_read(&event->count);
5362         if (reset)
5363                 local64_set(&event->count, 0);
5364         perf_event_ctx_unlock(event, ctx);
5365
5366         return count;
5367 }
5368 EXPORT_SYMBOL_GPL(perf_event_pause);
5369
5370 /*
5371  * Holding the top-level event's child_mutex means that any
5372  * descendant process that has inherited this event will block
5373  * in perf_event_exit_event() if it goes to exit, thus satisfying the
5374  * task existence requirements of perf_event_enable/disable.
5375  */
5376 static void perf_event_for_each_child(struct perf_event *event,
5377                                         void (*func)(struct perf_event *))
5378 {
5379         struct perf_event *child;
5380
5381         WARN_ON_ONCE(event->ctx->parent_ctx);
5382
5383         mutex_lock(&event->child_mutex);
5384         func(event);
5385         list_for_each_entry(child, &event->child_list, child_list)
5386                 func(child);
5387         mutex_unlock(&event->child_mutex);
5388 }
5389
5390 static void perf_event_for_each(struct perf_event *event,
5391                                   void (*func)(struct perf_event *))
5392 {
5393         struct perf_event_context *ctx = event->ctx;
5394         struct perf_event *sibling;
5395
5396         lockdep_assert_held(&ctx->mutex);
5397
5398         event = event->group_leader;
5399
5400         perf_event_for_each_child(event, func);
5401         for_each_sibling_event(sibling, event)
5402                 perf_event_for_each_child(sibling, func);
5403 }
5404
5405 static void __perf_event_period(struct perf_event *event,
5406                                 struct perf_cpu_context *cpuctx,
5407                                 struct perf_event_context *ctx,
5408                                 void *info)
5409 {
5410         u64 value = *((u64 *)info);
5411         bool active;
5412
5413         if (event->attr.freq) {
5414                 event->attr.sample_freq = value;
5415         } else {
5416                 event->attr.sample_period = value;
5417                 event->hw.sample_period = value;
5418         }
5419
5420         active = (event->state == PERF_EVENT_STATE_ACTIVE);
5421         if (active) {
5422                 perf_pmu_disable(ctx->pmu);
5423                 /*
5424                  * We could be throttled; unthrottle now to avoid the tick
5425                  * trying to unthrottle while we already re-started the event.
5426                  */
5427                 if (event->hw.interrupts == MAX_INTERRUPTS) {
5428                         event->hw.interrupts = 0;
5429                         perf_log_throttle(event, 1);
5430                 }
5431                 event->pmu->stop(event, PERF_EF_UPDATE);
5432         }
5433
5434         local64_set(&event->hw.period_left, 0);
5435
5436         if (active) {
5437                 event->pmu->start(event, PERF_EF_RELOAD);
5438                 perf_pmu_enable(ctx->pmu);
5439         }
5440 }
5441
5442 static int perf_event_check_period(struct perf_event *event, u64 value)
5443 {
5444         return event->pmu->check_period(event, value);
5445 }
5446
5447 static int _perf_event_period(struct perf_event *event, u64 value)
5448 {
5449         if (!is_sampling_event(event))
5450                 return -EINVAL;
5451
5452         if (!value)
5453                 return -EINVAL;
5454
5455         if (event->attr.freq && value > sysctl_perf_event_sample_rate)
5456                 return -EINVAL;
5457
5458         if (perf_event_check_period(event, value))
5459                 return -EINVAL;
5460
5461         if (!event->attr.freq && (value & (1ULL << 63)))
5462                 return -EINVAL;
5463
5464         event_function_call(event, __perf_event_period, &value);
5465
5466         return 0;
5467 }
5468
5469 int perf_event_period(struct perf_event *event, u64 value)
5470 {
5471         struct perf_event_context *ctx;
5472         int ret;
5473
5474         ctx = perf_event_ctx_lock(event);
5475         ret = _perf_event_period(event, value);
5476         perf_event_ctx_unlock(event, ctx);
5477
5478         return ret;
5479 }
5480 EXPORT_SYMBOL_GPL(perf_event_period);
5481
5482 static const struct file_operations perf_fops;
5483
5484 static inline int perf_fget_light(int fd, struct fd *p)
5485 {
5486         struct fd f = fdget(fd);
5487         if (!f.file)
5488                 return -EBADF;
5489
5490         if (f.file->f_op != &perf_fops) {
5491                 fdput(f);
5492                 return -EBADF;
5493         }
5494         *p = f;
5495         return 0;
5496 }
5497
5498 static int perf_event_set_output(struct perf_event *event,
5499                                  struct perf_event *output_event);
5500 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
5501 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
5502 static int perf_copy_attr(struct perf_event_attr __user *uattr,
5503                           struct perf_event_attr *attr);
5504
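/*
 * For illustration, userspace typically drives these commands as:
 *
 *   ioctl(fd, PERF_EVENT_IOC_RESET, 0);
 *   ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
 *   ...
 *   ioctl(fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
 *
 * Passing PERF_IOC_FLAG_GROUP in the argument makes the enable/disable/reset
 * style commands act on the whole group via perf_event_for_each() rather
 * than just this event and its children.
 */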
5505 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
5506 {
5507         void (*func)(struct perf_event *);
5508         u32 flags = arg;
5509
5510         switch (cmd) {
5511         case PERF_EVENT_IOC_ENABLE:
5512                 func = _perf_event_enable;
5513                 break;
5514         case PERF_EVENT_IOC_DISABLE:
5515                 func = _perf_event_disable;
5516                 break;
5517         case PERF_EVENT_IOC_RESET:
5518                 func = _perf_event_reset;
5519                 break;
5520
5521         case PERF_EVENT_IOC_REFRESH:
5522                 return _perf_event_refresh(event, arg);
5523
5524         case PERF_EVENT_IOC_PERIOD:
5525         {
5526                 u64 value;
5527
5528                 if (copy_from_user(&value, (u64 __user *)arg, sizeof(value)))
5529                         return -EFAULT;
5530
5531                 return _perf_event_period(event, value);
5532         }
5533         case PERF_EVENT_IOC_ID:
5534         {
5535                 u64 id = primary_event_id(event);
5536
5537                 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
5538                         return -EFAULT;
5539                 return 0;
5540         }
5541
5542         case PERF_EVENT_IOC_SET_OUTPUT:
5543         {
5544                 int ret;
5545                 if (arg != -1) {
5546                         struct perf_event *output_event;
5547                         struct fd output;
5548                         ret = perf_fget_light(arg, &output);
5549                         if (ret)
5550                                 return ret;
5551                         output_event = output.file->private_data;
5552                         ret = perf_event_set_output(event, output_event);
5553                         fdput(output);
5554                 } else {
5555                         ret = perf_event_set_output(event, NULL);
5556                 }
5557                 return ret;
5558         }
5559
5560         case PERF_EVENT_IOC_SET_FILTER:
5561                 return perf_event_set_filter(event, (void __user *)arg);
5562
5563         case PERF_EVENT_IOC_SET_BPF:
5564                 return perf_event_set_bpf_prog(event, arg);
5565
5566         case PERF_EVENT_IOC_PAUSE_OUTPUT: {
5567                 struct perf_buffer *rb;
5568
5569                 rcu_read_lock();
5570                 rb = rcu_dereference(event->rb);
5571                 if (!rb || !rb->nr_pages) {
5572                         rcu_read_unlock();
5573                         return -EINVAL;
5574                 }
5575                 rb_toggle_paused(rb, !!arg);
5576                 rcu_read_unlock();
5577                 return 0;
5578         }
5579
5580         case PERF_EVENT_IOC_QUERY_BPF:
5581                 return perf_event_query_prog_array(event, (void __user *)arg);
5582
5583         case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
5584                 struct perf_event_attr new_attr;
5585                 int err = perf_copy_attr((struct perf_event_attr __user *)arg,
5586                                          &new_attr);
5587
5588                 if (err)
5589                         return err;
5590
5591                 return perf_event_modify_attr(event,  &new_attr);
5592         }
5593         default:
5594                 return -ENOTTY;
5595         }
5596
5597         if (flags & PERF_IOC_FLAG_GROUP)
5598                 perf_event_for_each(event, func);
5599         else
5600                 perf_event_for_each_child(event, func);
5601
5602         return 0;
5603 }
5604
5605 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
5606 {
5607         struct perf_event *event = file->private_data;
5608         struct perf_event_context *ctx;
5609         long ret;
5610
5611         /* Treat ioctl like a write, as it is likely a mutating operation. */
5612         ret = security_perf_event_write(event);
5613         if (ret)
5614                 return ret;
5615
5616         ctx = perf_event_ctx_lock(event);
5617         ret = _perf_ioctl(event, cmd, arg);
5618         perf_event_ctx_unlock(event, ctx);
5619
5620         return ret;
5621 }
5622
5623 #ifdef CONFIG_COMPAT
5624 static long perf_compat_ioctl(struct file *file, unsigned int cmd,
5625                                 unsigned long arg)
5626 {
5627         switch (_IOC_NR(cmd)) {
5628         case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
5629         case _IOC_NR(PERF_EVENT_IOC_ID):
5630         case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
5631         case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
5632                 /* Fix up pointer size (usually 4 -> 8 in the 32-on-64-bit case) */
5633                 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
5634                         cmd &= ~IOCSIZE_MASK;
5635                         cmd |= sizeof(void *) << IOCSIZE_SHIFT;
5636                 }
5637                 break;
5638         }
5639         return perf_ioctl(file, cmd, arg);
5640 }
5641 #else
5642 # define perf_compat_ioctl NULL
5643 #endif
5644
5645 int perf_event_task_enable(void)
5646 {
5647         struct perf_event_context *ctx;
5648         struct perf_event *event;
5649
5650         mutex_lock(&current->perf_event_mutex);
5651         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5652                 ctx = perf_event_ctx_lock(event);
5653                 perf_event_for_each_child(event, _perf_event_enable);
5654                 perf_event_ctx_unlock(event, ctx);
5655         }
5656         mutex_unlock(&current->perf_event_mutex);
5657
5658         return 0;
5659 }
5660
5661 int perf_event_task_disable(void)
5662 {
5663         struct perf_event_context *ctx;
5664         struct perf_event *event;
5665
5666         mutex_lock(&current->perf_event_mutex);
5667         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5668                 ctx = perf_event_ctx_lock(event);
5669                 perf_event_for_each_child(event, _perf_event_disable);
5670                 perf_event_ctx_unlock(event, ctx);
5671         }
5672         mutex_unlock(&current->perf_event_mutex);
5673
5674         return 0;
5675 }
5676
5677 static int perf_event_index(struct perf_event *event)
5678 {
5679         if (event->hw.state & PERF_HES_STOPPED)
5680                 return 0;
5681
5682         if (event->state != PERF_EVENT_STATE_ACTIVE)
5683                 return 0;
5684
5685         return event->pmu->event_idx(event);
5686 }
5687
5688 static void calc_timer_values(struct perf_event *event,
5689                                 u64 *now,
5690                                 u64 *enabled,
5691                                 u64 *running)
5692 {
5693         u64 ctx_time;
5694
5695         *now = perf_clock();
5696         ctx_time = event->shadow_ctx_time + *now;
5697         __perf_update_times(event, ctx_time, enabled, running);
5698 }
5699
5700 static void perf_event_init_userpage(struct perf_event *event)
5701 {
5702         struct perf_event_mmap_page *userpg;
5703         struct perf_buffer *rb;
5704
5705         rcu_read_lock();
5706         rb = rcu_dereference(event->rb);
5707         if (!rb)
5708                 goto unlock;
5709
5710         userpg = rb->user_page;
5711
5712         /* Allow new userspace to detect that bit 0 is deprecated */
5713         userpg->cap_bit0_is_deprecated = 1;
5714         userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
5715         userpg->data_offset = PAGE_SIZE;
5716         userpg->data_size = perf_data_size(rb);
5717
5718 unlock:
5719         rcu_read_unlock();
5720 }
5721
5722 void __weak arch_perf_update_userpage(
5723         struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
5724 {
5725 }
5726
5727 /*
5728  * Callers need to ensure there can be no nesting of this function, otherwise
5729  * the seqlock logic goes bad. We cannot serialize this because the arch
5730  * code calls this from NMI context.
5731  */
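/*
 * Userspace is expected to read the self-monitoring data published below
 * under the ->lock sequence count, roughly as follows (see the
 * perf_event_mmap_page documentation in include/uapi/linux/perf_event.h
 * for the full, scaled version):
 *
 *   u32 seq;
 *   u64 count, enabled, running;
 *
 *   do {
 *           seq = pc->lock;
 *           barrier();
 *           count   = pc->offset;   /* plus the raw PMC value if pc->index */
 *           enabled = pc->time_enabled;
 *           running = pc->time_running;
 *           barrier();
 *   } while (pc->lock != seq);
 */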
5732 void perf_event_update_userpage(struct perf_event *event)
5733 {
5734         struct perf_event_mmap_page *userpg;
5735         struct perf_buffer *rb;
5736         u64 enabled, running, now;
5737
5738         rcu_read_lock();
5739         rb = rcu_dereference(event->rb);
5740         if (!rb)
5741                 goto unlock;
5742
5743         /*
5744          * compute total_time_enabled, total_time_running
5745          * based on snapshot values taken when the event
5746          * was last scheduled in.
5747          *
5748          * we cannot simply call update_context_time()
5749          * because of locking issues, as we can be called in
5750          * NMI context
5751          */
5752         calc_timer_values(event, &now, &enabled, &running);
5753
5754         userpg = rb->user_page;
5755         /*
5756          * Disable preemption to guarantee consistent time stamps are stored to
5757          * the user page.
5758          */
5759         preempt_disable();
5760         ++userpg->lock;
5761         barrier();
5762         userpg->index = perf_event_index(event);
5763         userpg->offset = perf_event_count(event);
5764         if (userpg->index)
5765                 userpg->offset -= local64_read(&event->hw.prev_count);
5766
5767         userpg->time_enabled = enabled +
5768                         atomic64_read(&event->child_total_time_enabled);
5769
5770         userpg->time_running = running +
5771                         atomic64_read(&event->child_total_time_running);
5772
5773         arch_perf_update_userpage(event, userpg, now);
5774
5775         barrier();
5776         ++userpg->lock;
5777         preempt_enable();
5778 unlock:
5779         rcu_read_unlock();
5780 }
5781 EXPORT_SYMBOL_GPL(perf_event_update_userpage);
5782
5783 static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
5784 {
5785         struct perf_event *event = vmf->vma->vm_file->private_data;
5786         struct perf_buffer *rb;
5787         vm_fault_t ret = VM_FAULT_SIGBUS;
5788
5789         if (vmf->flags & FAULT_FLAG_MKWRITE) {
5790                 if (vmf->pgoff == 0)
5791                         ret = 0;
5792                 return ret;
5793         }
5794
5795         rcu_read_lock();
5796         rb = rcu_dereference(event->rb);
5797         if (!rb)
5798                 goto unlock;
5799
5800         if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
5801                 goto unlock;
5802
5803         vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
5804         if (!vmf->page)
5805                 goto unlock;
5806
5807         get_page(vmf->page);
5808         vmf->page->mapping = vmf->vma->vm_file->f_mapping;
5809         vmf->page->index   = vmf->pgoff;
5810
5811         ret = 0;
5812 unlock:
5813         rcu_read_unlock();
5814
5815         return ret;
5816 }
5817
5818 static void ring_buffer_attach(struct perf_event *event,
5819                                struct perf_buffer *rb)
5820 {
5821         struct perf_buffer *old_rb = NULL;
5822         unsigned long flags;
5823
5824         if (event->rb) {
5825                 /*
5826                  * Should be impossible, we set this when removing
5827                  * event->rb_entry and wait/clear when adding event->rb_entry.
5828                  */
5829                 WARN_ON_ONCE(event->rcu_pending);
5830
5831                 old_rb = event->rb;
5832                 spin_lock_irqsave(&old_rb->event_lock, flags);
5833                 list_del_rcu(&event->rb_entry);
5834                 spin_unlock_irqrestore(&old_rb->event_lock, flags);
5835
5836                 event->rcu_batches = get_state_synchronize_rcu();
5837                 event->rcu_pending = 1;
5838         }
5839
5840         if (rb) {
5841                 if (event->rcu_pending) {
5842                         cond_synchronize_rcu(event->rcu_batches);
5843                         event->rcu_pending = 0;
5844                 }
5845
5846                 spin_lock_irqsave(&rb->event_lock, flags);
5847                 list_add_rcu(&event->rb_entry, &rb->event_list);
5848                 spin_unlock_irqrestore(&rb->event_lock, flags);
5849         }
5850
5851         /*
5852          * Avoid racing with perf_mmap_close(AUX): stop the event
5853          * before swizzling the event::rb pointer; if it's getting
5854          * unmapped, its aux_mmap_count will be 0 and it won't
5855          * restart. See the comment in __perf_pmu_output_stop().
5856          *
5857          * Data will inevitably be lost when set_output is done in
5858          * mid-air, but then again, whoever does it like this is
5859          * not in for the data anyway.
5860          */
5861         if (has_aux(event))
5862                 perf_event_stop(event, 0);
5863
5864         rcu_assign_pointer(event->rb, rb);
5865
5866         if (old_rb) {
5867                 ring_buffer_put(old_rb);
5868                 /*
5869                  * Since we detached before setting the new rb (so that we
5870                  * could attach the new rb), we could have missed a wakeup.
5871                  * Provide it now.
5872                  */
5873                 wake_up_all(&event->waitq);
5874         }
5875 }
5876
5877 static void ring_buffer_wakeup(struct perf_event *event)
5878 {
5879         struct perf_buffer *rb;
5880
5881         rcu_read_lock();
5882         rb = rcu_dereference(event->rb);
5883         if (rb) {
5884                 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
5885                         wake_up_all(&event->waitq);
5886         }
5887         rcu_read_unlock();
5888 }
5889
5890 struct perf_buffer *ring_buffer_get(struct perf_event *event)
5891 {
5892         struct perf_buffer *rb;
5893
5894         rcu_read_lock();
5895         rb = rcu_dereference(event->rb);
5896         if (rb) {
5897                 if (!refcount_inc_not_zero(&rb->refcount))
5898                         rb = NULL;
5899         }
5900         rcu_read_unlock();
5901
5902         return rb;
5903 }
5904
5905 void ring_buffer_put(struct perf_buffer *rb)
5906 {
5907         if (!refcount_dec_and_test(&rb->refcount))
5908                 return;
5909
5910         WARN_ON_ONCE(!list_empty(&rb->event_list));
5911
5912         call_rcu(&rb->rcu_head, rb_free_rcu);
5913 }
5914
5915 static void perf_mmap_open(struct vm_area_struct *vma)
5916 {
5917         struct perf_event *event = vma->vm_file->private_data;
5918
5919         atomic_inc(&event->mmap_count);
5920         atomic_inc(&event->rb->mmap_count);
5921
5922         if (vma->vm_pgoff)
5923                 atomic_inc(&event->rb->aux_mmap_count);
5924
5925         if (event->pmu->event_mapped)
5926                 event->pmu->event_mapped(event, vma->vm_mm);
5927 }
5928
5929 static void perf_pmu_output_stop(struct perf_event *event);
5930
5931 /*
5932  * A buffer can be mmap()ed multiple times; either directly through the same
5933  * event, or through other events by use of perf_event_set_output().
5934  *
5935  * In order to undo the VM accounting done by perf_mmap() we need to destroy
5936  * the buffer here, where we still have a VM context. This means we need
5937  * to detach all events redirecting to us.
5938  */
5939 static void perf_mmap_close(struct vm_area_struct *vma)
5940 {
5941         struct perf_event *event = vma->vm_file->private_data;
5942         struct perf_buffer *rb = ring_buffer_get(event);
5943         struct user_struct *mmap_user = rb->mmap_user;
5944         int mmap_locked = rb->mmap_locked;
5945         unsigned long size = perf_data_size(rb);
5946         bool detach_rest = false;
5947
5948         if (event->pmu->event_unmapped)
5949                 event->pmu->event_unmapped(event, vma->vm_mm);
5950
5951         /*
5952          * rb->aux_mmap_count will always drop before rb->mmap_count and
5953          * event->mmap_count, so it is ok to use event->mmap_mutex to
5954          * serialize with perf_mmap here.
5955          */
5956         if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
5957             atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
5958                 /*
5959                  * Stop all AUX events that are writing to this buffer,
5960                  * so that we can free its AUX pages and corresponding PMU
5961                  * data. Note that after rb::aux_mmap_count dropped to zero,
5962                  * they won't start any more (see perf_aux_output_begin()).
5963                  */
5964                 perf_pmu_output_stop(event);
5965
5966                 /* now it's safe to free the pages */
5967                 atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
5968                 atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
5969
5970                 /* this has to be the last one */
5971                 rb_free_aux(rb);
5972                 WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
5973
5974                 mutex_unlock(&event->mmap_mutex);
5975         }
5976
5977         if (atomic_dec_and_test(&rb->mmap_count))
5978                 detach_rest = true;
5979
5980         if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
5981                 goto out_put;
5982
5983         ring_buffer_attach(event, NULL);
5984         mutex_unlock(&event->mmap_mutex);
5985
5986         /* If there's still other mmap()s of this buffer, we're done. */
5987         if (!detach_rest)
5988                 goto out_put;
5989
5990         /*
5991          * No other mmap()s, detach from all other events that might redirect
5992          * into the now unreachable buffer. Somewhat complicated by the
5993          * fact that rb::event_lock otherwise nests inside mmap_mutex.
5994          */
5995 again:
5996         rcu_read_lock();
5997         list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
5998                 if (!atomic_long_inc_not_zero(&event->refcount)) {
5999                         /*
6000                          * This event is en-route to free_event() which will
6001                          * detach it and remove it from the list.
6002                          */
6003                         continue;
6004                 }
6005                 rcu_read_unlock();
6006
6007                 mutex_lock(&event->mmap_mutex);
6008                 /*
6009                  * Check we didn't race with perf_event_set_output() which can
6010                  * swizzle the rb from under us while we were waiting to
6011                  * acquire mmap_mutex.
6012                  *
6013                  * If we find a different rb, ignore this event; a later
6014                  * iteration will no longer find it on the list. We still
6015                  * have to restart the iteration to make sure we're not now
6016                  * iterating the wrong list.
6017                  */
6018                 if (event->rb == rb)
6019                         ring_buffer_attach(event, NULL);
6020
6021                 mutex_unlock(&event->mmap_mutex);
6022                 put_event(event);
6023
6024                 /*
6025                  * Restart the iteration; either we're on the wrong list or
6026                  * we destroyed its integrity by doing a deletion.
6027                  */
6028                 goto again;
6029         }
6030         rcu_read_unlock();
6031
6032         /*
6033          * It could be that there are still a few 0-ref events on the list; they'll
6034          * get cleaned up by free_event() -- they'll also still have their
6035          * ref on the rb and will free it whenever they are done with it.
6036          *
6037          * Aside from that, this buffer is 'fully' detached and unmapped,
6038          * undo the VM accounting.
6039          */
6040
6041         atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
6042                         &mmap_user->locked_vm);
6043         atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
6044         free_uid(mmap_user);
6045
6046 out_put:
6047         ring_buffer_put(rb); /* could be last */
6048 }
6049
6050 static const struct vm_operations_struct perf_mmap_vmops = {
6051         .open           = perf_mmap_open,
6052         .close          = perf_mmap_close, /* non mergeable */
6053         .fault          = perf_mmap_fault,
6054         .page_mkwrite   = perf_mmap_fault,
6055 };
6056
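/*
 * Layout served by perf_mmap(): page 0 is the perf_event_mmap_page control
 * page, followed by 2^n data pages (vma->vm_pgoff == 0). The AUX area is a
 * separate mapping at the aux_offset/aux_size that userspace wrote into the
 * control page. For illustration, a typical data mapping looks roughly like:
 *
 *   base = mmap(NULL, (1 + nr_pages) * page_size,
 *               PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 * with nr_pages a power of two, as enforced below.
 */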
6057 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
6058 {
6059         struct perf_event *event = file->private_data;
6060         unsigned long user_locked, user_lock_limit;
6061         struct user_struct *user = current_user();
6062         struct perf_buffer *rb = NULL;
6063         unsigned long locked, lock_limit;
6064         unsigned long vma_size;
6065         unsigned long nr_pages;
6066         long user_extra = 0, extra = 0;
6067         int ret = 0, flags = 0;
6068
6069         /*
6070          * Don't allow mmap() of inherited per-task counters. This would
6071          * create a performance issue due to all children writing to the
6072          * same rb.
6073          */
6074         if (event->cpu == -1 && event->attr.inherit)
6075                 return -EINVAL;
6076
6077         if (!(vma->vm_flags & VM_SHARED))
6078                 return -EINVAL;
6079
6080         ret = security_perf_event_read(event);
6081         if (ret)
6082                 return ret;
6083
6084         vma_size = vma->vm_end - vma->vm_start;
6085
6086         if (vma->vm_pgoff == 0) {
6087                 nr_pages = (vma_size / PAGE_SIZE) - 1;
6088         } else {
6089                 /*
6090                  * AUX area mapping: if rb->aux_nr_pages != 0, it's already
6091                  * mapped; all subsequent mappings should have the same size
6092                  * and offset. Must be above the normal perf buffer.
6093                  */
6094                 u64 aux_offset, aux_size;
6095
6096                 if (!event->rb)
6097                         return -EINVAL;
6098
6099                 nr_pages = vma_size / PAGE_SIZE;
6100
6101                 mutex_lock(&event->mmap_mutex);
6102                 ret = -EINVAL;
6103
6104                 rb = event->rb;
6105                 if (!rb)
6106                         goto aux_unlock;
6107
6108                 aux_offset = READ_ONCE(rb->user_page->aux_offset);
6109                 aux_size = READ_ONCE(rb->user_page->aux_size);
6110
6111                 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
6112                         goto aux_unlock;
6113
6114                 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
6115                         goto aux_unlock;
6116
6117                 /* already mapped with a different offset */
6118                 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
6119                         goto aux_unlock;
6120
6121                 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
6122                         goto aux_unlock;
6123
6124                 /* already mapped with a different size */
6125                 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
6126                         goto aux_unlock;
6127
6128                 if (!is_power_of_2(nr_pages))
6129                         goto aux_unlock;
6130
6131                 if (!atomic_inc_not_zero(&rb->mmap_count))
6132                         goto aux_unlock;
6133
6134                 if (rb_has_aux(rb)) {
6135                         atomic_inc(&rb->aux_mmap_count);
6136                         ret = 0;
6137                         goto unlock;
6138                 }
6139
6140                 atomic_set(&rb->aux_mmap_count, 1);
6141                 user_extra = nr_pages;
6142
6143                 goto accounting;
6144         }
6145
6146         /*
6147          * If we have rb pages ensure they're a power-of-two number, so we
6148          * can do bitmasks instead of modulo.
6149          */
6150         if (nr_pages != 0 && !is_power_of_2(nr_pages))
6151                 return -EINVAL;
6152
6153         if (vma_size != PAGE_SIZE * (1 + nr_pages))
6154                 return -EINVAL;
6155
6156         WARN_ON_ONCE(event->ctx->parent_ctx);
6157 again:
6158         mutex_lock(&event->mmap_mutex);
6159         if (event->rb) {
6160                 if (event->rb->nr_pages != nr_pages) {
6161                         ret = -EINVAL;
6162                         goto unlock;
6163                 }
6164
6165                 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
6166                         /*
6167                          * Raced against perf_mmap_close() through
6168                          * perf_event_set_output(). Try again, hope for better
6169                          * luck.
6170                          */
6171                         mutex_unlock(&event->mmap_mutex);
6172                         goto again;
6173                 }
6174
6175                 goto unlock;
6176         }
6177
6178         user_extra = nr_pages + 1;
6179
6180 accounting:
6181         user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
6182
6183         /*
6184          * Increase the limit linearly with more CPUs:
6185          */
6186         user_lock_limit *= num_online_cpus();
6187
6188         user_locked = atomic_long_read(&user->locked_vm);
6189
6190         /*
6191          * sysctl_perf_event_mlock may have changed, so that
6192          *     user->locked_vm > user_lock_limit
6193          */
6194         if (user_locked > user_lock_limit)
6195                 user_locked = user_lock_limit;
6196         user_locked += user_extra;
6197
6198         if (user_locked > user_lock_limit) {
6199                 /*
6200                  * charge locked_vm until it hits user_lock_limit;
6201                  * charge the rest from pinned_vm
6202                  */
6203                 extra = user_locked - user_lock_limit;
6204                 user_extra -= extra;
6205         }
6206
6207         lock_limit = rlimit(RLIMIT_MEMLOCK);
6208         lock_limit >>= PAGE_SHIFT;
6209         locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
6210
6211         if ((locked > lock_limit) && perf_is_paranoid() &&
6212                 !capable(CAP_IPC_LOCK)) {
6213                 ret = -EPERM;
6214                 goto unlock;
6215         }
6216
6217         WARN_ON(!rb && event->rb);
6218
6219         if (vma->vm_flags & VM_WRITE)
6220                 flags |= RING_BUFFER_WRITABLE;
6221
6222         if (!rb) {
6223                 rb = rb_alloc(nr_pages,
6224                               event->attr.watermark ? event->attr.wakeup_watermark : 0,
6225                               event->cpu, flags);
6226
6227                 if (!rb) {
6228                         ret = -ENOMEM;
6229                         goto unlock;
6230                 }
6231
6232                 atomic_set(&rb->mmap_count, 1);
6233                 rb->mmap_user = get_current_user();
6234                 rb->mmap_locked = extra;
6235
6236                 ring_buffer_attach(event, rb);
6237
6238                 perf_event_init_userpage(event);
6239                 perf_event_update_userpage(event);
6240         } else {
6241                 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
6242                                    event->attr.aux_watermark, flags);
6243                 if (!ret)
6244                         rb->aux_mmap_locked = extra;
6245         }
6246
6247 unlock:
6248         if (!ret) {
6249                 atomic_long_add(user_extra, &user->locked_vm);
6250                 atomic64_add(extra, &vma->vm_mm->pinned_vm);
6251
6252                 atomic_inc(&event->mmap_count);
6253         } else if (rb) {
6254                 atomic_dec(&rb->mmap_count);
6255         }
6256 aux_unlock:
6257         mutex_unlock(&event->mmap_mutex);
6258
6259         /*
6260          * Since pinned accounting is per vm we cannot allow fork() to copy our
6261          * vma.
6262          */
6263         vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
6264         vma->vm_ops = &perf_mmap_vmops;
6265
6266         if (event->pmu->event_mapped)
6267                 event->pmu->event_mapped(event, vma->vm_mm);
6268
6269         return ret;
6270 }
6271
6272 static int perf_fasync(int fd, struct file *filp, int on)
6273 {
6274         struct inode *inode = file_inode(filp);
6275         struct perf_event *event = filp->private_data;
6276         int retval;
6277
6278         inode_lock(inode);
6279         retval = fasync_helper(fd, filp, on, &event->fasync);
6280         inode_unlock(inode);
6281
6282         if (retval < 0)
6283                 return retval;
6284
6285         return 0;
6286 }
6287
6288 static const struct file_operations perf_fops = {
6289         .llseek                 = no_llseek,
6290         .release                = perf_release,
6291         .read                   = perf_read,
6292         .poll                   = perf_poll,
6293         .unlocked_ioctl         = perf_ioctl,
6294         .compat_ioctl           = perf_compat_ioctl,
6295         .mmap                   = perf_mmap,
6296         .fasync                 = perf_fasync,
6297 };
6298
6299 /*
6300  * Perf event wakeup
6301  *
6302  * If there's data, ensure we set the poll() state and publish everything
6303  * to user-space before waking everybody up.
6304  */
6305
6306 static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
6307 {
6308         /* only the parent has fasync state */
6309         if (event->parent)
6310                 event = event->parent;
6311         return &event->fasync;
6312 }
6313
6314 void perf_event_wakeup(struct perf_event *event)
6315 {
6316         ring_buffer_wakeup(event);
6317
6318         if (event->pending_kill) {
6319                 kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
6320                 event->pending_kill = 0;
6321         }
6322 }
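
/*
 * perf_event_wakeup() is what makes poll() on the event fd return and,
 * via kill_fasync(), what delivers SIGIO to consumers that enabled
 * O_ASYNC through perf_fasync() above. A minimal user-space sketch of
 * both notification styles (hypothetical helper names, error handling
 * trimmed):
 */
#if 0	/* illustrative only, built as stand-alone user-space code */
#include <fcntl.h>
#include <signal.h>
#include <unistd.h>
#include <poll.h>

static volatile sig_atomic_t got_sigio;

static void on_sigio(int sig)
{
	got_sigio = 1;
}

/* @fd: a file descriptor returned by perf_event_open(2) */
static void arm_async_notification(int fd)
{
	signal(SIGIO, on_sigio);

	/* route SIGIO from this fd to us; set up by perf_fasync() */
	fcntl(fd, F_SETOWN, getpid());
	fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_ASYNC);
}

/* alternatively, block until perf_event_wakeup() fires */
static void wait_for_wakeup(int fd)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };

	poll(&pfd, 1, -1);
}
#endif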
6323
6324 static void perf_pending_event_disable(struct perf_event *event)
6325 {
6326         int cpu = READ_ONCE(event->pending_disable);
6327
6328         if (cpu < 0)
6329                 return;
6330
6331         if (cpu == smp_processor_id()) {
6332                 WRITE_ONCE(event->pending_disable, -1);
6333                 perf_event_disable_local(event);
6334                 return;
6335         }
6336
6337         /*
6338          *  CPU-A                       CPU-B
6339          *
6340          *  perf_event_disable_inatomic()
6341          *    @pending_disable = CPU-A;
6342          *    irq_work_queue();
6343          *
6344          *  sched-out
6345          *    @pending_disable = -1;
6346          *
6347          *                              sched-in
6348          *                              perf_event_disable_inatomic()
6349          *                                @pending_disable = CPU-B;
6350          *                                irq_work_queue(); // FAILS
6351          *
6352          *  irq_work_run()
6353          *    perf_pending_event()
6354          *
6355          * But the event runs on CPU-B and wants disabling there.
6356          */
6357         irq_work_queue_on(&event->pending, cpu);
6358 }
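
/*
 * The function above relies on irq_work being queueable on a specific
 * remote CPU. A minimal sketch of that generic pattern, assuming only
 * the <linux/irq_work.h> API (init_irq_work(), irq_work_queue(),
 * irq_work_queue_on()); the names below are hypothetical and the
 * caller is assumed to have preemption disabled:
 */
#if 0	/* illustrative only */
#include <linux/irq_work.h>
#include <linux/smp.h>

static struct irq_work example_work;

static void example_work_fn(struct irq_work *work)
{
	/* runs in hard-IRQ context on the CPU the work was queued on */
}

static void example_queue_on(int target_cpu)
{
	init_irq_work(&example_work, example_work_fn);

	if (target_cpu == smp_processor_id())
		irq_work_queue(&example_work);			/* local CPU */
	else
		irq_work_queue_on(&example_work, target_cpu);	/* via IPI */
}
#endif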
6359
6360 static void perf_pending_event(struct irq_work *entry)
6361 {
6362         struct perf_event *event = container_of(entry, struct perf_event, pending);
6363         int rctx;
6364
6365         rctx = perf_swevent_get_recursion_context();
6366         /*
6367          * If we 'fail' here, that's OK, it means recursion is already disabled
6368          * and we won't recurse 'further'.
6369          */
6370
6371         perf_pending_event_disable(event);
6372
6373         if (event->pending_wakeup) {
6374                 event->pending_wakeup = 0;
6375                 perf_event_wakeup(event);
6376         }
6377
6378         if (rctx >= 0)
6379                 perf_swevent_put_recursion_context(rctx);
6380 }
6381
6382 /*
6383  * We assume there is only KVM supporting the callbacks.
6384  * Later on, we might change it to a list if there is
6385  * another virtualization implementation supporting the callbacks.
6386  */
6387 struct perf_guest_info_callbacks *perf_guest_cbs;
6388
6389 int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
6390 {
6391         perf_guest_cbs = cbs;
6392         return 0;
6393 }
6394 EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
6395
6396 int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
6397 {
6398         perf_guest_cbs = NULL;
6399         return 0;
6400 }
6401 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
6402
6403 static void
6404 perf_output_sample_regs(struct perf_output_handle *handle,
6405                         struct pt_regs *regs, u64 mask)
6406 {
6407         int bit;
6408         DECLARE_BITMAP(_mask, 64);
6409
6410         bitmap_from_u64(_mask, mask);
6411         for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
6412                 u64 val;
6413
6414                 val = perf_reg_value(regs, bit);
6415                 perf_output_put(handle, val);
6416         }
6417 }
6418
6419 static void perf_sample_regs_user(struct perf_regs *regs_user,
6420                                   struct pt_regs *regs)
6421 {
6422         if (user_mode(regs)) {
6423                 regs_user->abi = perf_reg_abi(current);
6424                 regs_user->regs = regs;
6425         } else if (!(current->flags & PF_KTHREAD)) {
6426                 perf_get_regs_user(regs_user, regs);
6427         } else {
6428                 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
6429                 regs_user->regs = NULL;
6430         }
6431 }
6432
6433 static void perf_sample_regs_intr(struct perf_regs *regs_intr,
6434                                   struct pt_regs *regs)
6435 {
6436         regs_intr->regs = regs;
6437         regs_intr->abi  = perf_reg_abi(current);
6438 }
6439
6440
6441 /*
6442  * Get remaining task size from user stack pointer.
6443  *
6444  * It'd be better to look up the stack VMA and limit this more
6445  * precisely, but there's no way to do that safely from interrupt
6446  * context, so use TASK_SIZE as the limit.
6447  */
6448 static u64 perf_ustack_task_size(struct pt_regs *regs)
6449 {
6450         unsigned long addr = perf_user_stack_pointer(regs);
6451
6452         if (!addr || addr >= TASK_SIZE)
6453                 return 0;
6454
6455         return TASK_SIZE - addr;
6456 }
6457
6458 static u16
6459 perf_sample_ustack_size(u16 stack_size, u16 header_size,
6460                         struct pt_regs *regs)
6461 {
6462         u64 task_size;
6463
6464         /* No regs, no stack pointer, no dump. */
6465         if (!regs)
6466                 return 0;
6467
6468         /*
6469          * Check whether the requested stack dump size fits into:
6470          * - TASK_SIZE
6471          *   If it doesn't, limit the size to TASK_SIZE.
6472          *
6473          * - the remaining sample size
6474          *   If it doesn't, shrink the stack dump size so that it
6475          *   fits into the remaining sample size.
6476          */
6477
6478         task_size  = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
6479         stack_size = min(stack_size, (u16) task_size);
6480
6481         /* Current header size plus static size and dynamic size. */
6482         header_size += 2 * sizeof(u64);
6483
6484         /* Does the stack dump still fit into the sample? */
6485         if ((u16) (header_size + stack_size) < header_size) {
6486                 /*
6487                  * The u16 sample size would overflow, so shrink the
6488                  * stack dump size until the sample fits again.
6489                  */
6490                 stack_size = USHRT_MAX - header_size - sizeof(u64);
6491                 stack_size = round_up(stack_size, sizeof(u64));
6492         }
6493
6494         return stack_size;
6495 }
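
/*
 * A worked example of the u16 overflow clamp above (numbers are
 * illustrative): with header_size = 984 on entry, it becomes
 * 984 + 16 = 1000 once the static and dynamic size fields are added.
 * A requested stack_size of 65000 gives 1000 + 65000 = 66000, which
 * wraps to (u16)66000 = 464 < 1000, so the dump is shrunk to
 * USHRT_MAX - 1000 - 8 = 64527, rounded up to 64528, and
 * 1000 + 64528 = 65528 still fits in the u16 header size.
 */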
6496
6497 static void
6498 perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
6499                           struct pt_regs *regs)
6500 {
6501         /* Case of a kernel thread, nothing to dump */
6502         if (!regs) {
6503                 u64 size = 0;
6504                 perf_output_put(handle, size);
6505         } else {
6506                 unsigned long sp;
6507                 unsigned int rem;
6508                 u64 dyn_size;
6509                 mm_segment_t fs;
6510
6511                 /*
6512                  * We dump:
6513                  * static size
6514                  *   - the size requested by the user or the best one we can
6515                  *     fit into the sample max size
6516                  * data
6517                  *   - user stack dump data
6518                  * dynamic size
6519                  *   - the actual dumped size
6520                  */
6521
6522                 /* Static size. */
6523                 perf_output_put(handle, dump_size);
6524
6525                 /* Data. */
6526                 sp = perf_user_stack_pointer(regs);
6527                 fs = force_uaccess_begin();
6528                 rem = __output_copy_user(handle, (void *) sp, dump_size);
6529                 force_uaccess_end(fs);
6530                 dyn_size = dump_size - rem;
6531
6532                 perf_output_skip(handle, rem);
6533
6534                 /* Dynamic size. */
6535                 perf_output_put(handle, dyn_size);
6536         }
6537 }
6538
6539 static unsigned long perf_prepare_sample_aux(struct perf_event *event,
6540                                           struct perf_sample_data *data,
6541                                           size_t size)
6542 {
6543         struct perf_event *sampler = event->aux_event;
6544         struct perf_buffer *rb;
6545
6546         data->aux_size = 0;
6547
6548         if (!sampler)
6549                 goto out;
6550
6551         if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
6552                 goto out;
6553
6554         if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
6555                 goto out;
6556
6557         rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
6558         if (!rb)
6559                 goto out;
6560
6561         /*
6562          * If this is an NMI hit inside sampling code, don't take
6563          * the sample. See also perf_aux_sample_output().
6564          */
6565         if (READ_ONCE(rb->aux_in_sampling)) {
6566                 data->aux_size = 0;
6567         } else {
6568                 size = min_t(size_t, size, perf_aux_size(rb));
6569                 data->aux_size = ALIGN(size, sizeof(u64));
6570         }
6571         ring_buffer_put(rb);
6572
6573 out:
6574         return data->aux_size;
6575 }
6576
6577 long perf_pmu_snapshot_aux(struct perf_buffer *rb,
6578                            struct perf_event *event,
6579                            struct perf_output_handle *handle,
6580                            unsigned long size)
6581 {
6582         unsigned long flags;
6583         long ret;
6584
6585         /*
6586          * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler
6587          * paths. If we start calling them in NMI context, they may race with
6588          * the IRQ ones, that is, for example, re-starting an event that's just
6589          * been stopped, which is why we're using a separate callback that
6590          * doesn't change the event state.
6591          *
6592          * IRQs need to be disabled to prevent IPIs from racing with us.
6593          */
6594         local_irq_save(flags);
6595         /*
6596          * Guard against NMI hits inside the critical section;
6597          * see also perf_prepare_sample_aux().
6598          */
6599         WRITE_ONCE(rb->aux_in_sampling, 1);
6600         barrier();
6601
6602         ret = event->pmu->snapshot_aux(event, handle, size);
6603
6604         barrier();
6605         WRITE_ONCE(rb->aux_in_sampling, 0);
6606         local_irq_restore(flags);
6607
6608         return ret;
6609 }
6610
6611 static void perf_aux_sample_output(struct perf_event *event,
6612                                    struct perf_output_handle *handle,
6613                                    struct perf_sample_data *data)
6614 {
6615         struct perf_event *sampler = event->aux_event;
6616         struct perf_buffer *rb;
6617         unsigned long pad;
6618         long size;
6619
6620         if (WARN_ON_ONCE(!sampler || !data->aux_size))
6621                 return;
6622
6623         rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
6624         if (!rb)
6625                 return;
6626
6627         size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size);
6628
6629         /*
6630          * An error here means that perf_output_copy() failed (returned a
6631          * non-zero surplus that it didn't copy), which in its current
6632          * enlightened implementation is not possible. If that changes, we'd
6633          * like to know.
6634          */
6635         if (WARN_ON_ONCE(size < 0))
6636                 goto out_put;
6637
6638         /*
6639          * The pad comes from ALIGN()ing data->aux_size up to u64 in
6640          * perf_prepare_sample_aux(), so should not be more than that.
6641          */
6642         pad = data->aux_size - size;
6643         if (WARN_ON_ONCE(pad >= sizeof(u64)))
6644                 pad = 8;
6645
6646         if (pad) {
6647                 u64 zero = 0;
6648                 perf_output_copy(handle, &zero, pad);
6649         }
6650
6651 out_put:
6652         ring_buffer_put(rb);
6653 }
6654
6655 static void __perf_event_header__init_id(struct perf_event_header *header,
6656                                          struct perf_sample_data *data,
6657                                          struct perf_event *event)
6658 {
6659         u64 sample_type = event->attr.sample_type;
6660
6661         data->type = sample_type;
6662         header->size += event->id_header_size;
6663
6664         if (sample_type & PERF_SAMPLE_TID) {
6665                 /* namespace issues */
6666                 data->tid_entry.pid = perf_event_pid(event, current);
6667                 data->tid_entry.tid = perf_event_tid(event, current);
6668         }
6669
6670         if (sample_type & PERF_SAMPLE_TIME)
6671                 data->time = perf_event_clock(event);
6672
6673         if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
6674                 data->id = primary_event_id(event);
6675
6676         if (sample_type & PERF_SAMPLE_STREAM_ID)
6677                 data->stream_id = event->id;
6678
6679         if (sample_type & PERF_SAMPLE_CPU) {
6680                 data->cpu_entry.cpu      = raw_smp_processor_id();
6681                 data->cpu_entry.reserved = 0;
6682         }
6683 }
6684
6685 void perf_event_header__init_id(struct perf_event_header *header,
6686                                 struct perf_sample_data *data,
6687                                 struct perf_event *event)
6688 {
6689         if (event->attr.sample_id_all)
6690                 __perf_event_header__init_id(header, data, event);
6691 }
6692
6693 static void __perf_event__output_id_sample(struct perf_output_handle *handle,
6694                                            struct perf_sample_data *data)
6695 {
6696         u64 sample_type = data->type;
6697
6698         if (sample_type & PERF_SAMPLE_TID)
6699                 perf_output_put(handle, data->tid_entry);
6700
6701         if (sample_type & PERF_SAMPLE_TIME)
6702                 perf_output_put(handle, data->time);
6703
6704         if (sample_type & PERF_SAMPLE_ID)
6705                 perf_output_put(handle, data->id);
6706
6707         if (sample_type & PERF_SAMPLE_STREAM_ID)
6708                 perf_output_put(handle, data->stream_id);
6709
6710         if (sample_type & PERF_SAMPLE_CPU)
6711                 perf_output_put(handle, data->cpu_entry);
6712
6713         if (sample_type & PERF_SAMPLE_IDENTIFIER)
6714                 perf_output_put(handle, data->id);
6715 }
6716
6717 void perf_event__output_id_sample(struct perf_event *event,
6718                                   struct perf_output_handle *handle,
6719                                   struct perf_sample_data *sample)
6720 {
6721         if (event->attr.sample_id_all)
6722                 __perf_event__output_id_sample(handle, sample);
6723 }
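
/*
 * For reference, when attr.sample_id_all is set the two helpers above
 * append this trailer to non-sample records, in exactly the order
 * emitted by __perf_event__output_id_sample() (sketched in the
 * include/uapi/linux/perf_event.h comment style):
 *
 *	struct sample_id {
 *		{ u32	pid, tid;   } && PERF_SAMPLE_TID
 *		{ u64	time;       } && PERF_SAMPLE_TIME
 *		{ u64	id;         } && PERF_SAMPLE_ID
 *		{ u64	stream_id;  } && PERF_SAMPLE_STREAM_ID
 *		{ u32	cpu, res;   } && PERF_SAMPLE_CPU
 *		{ u64	id;         } && PERF_SAMPLE_IDENTIFIER
 *	} && perf_event_attr::sample_id_all
 */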
6724
6725 static void perf_output_read_one(struct perf_output_handle *handle,
6726                                  struct perf_event *event,
6727                                  u64 enabled, u64 running)
6728 {
6729         u64 read_format = event->attr.read_format;
6730         u64 values[4];
6731         int n = 0;
6732
6733         values[n++] = perf_event_count(event);
6734         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
6735                 values[n++] = enabled +
6736                         atomic64_read(&event->child_total_time_enabled);
6737         }
6738         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
6739                 values[n++] = running +
6740                         atomic64_read(&event->child_total_time_running);
6741         }
6742         if (read_format & PERF_FORMAT_ID)
6743                 values[n++] = primary_event_id(event);
6744
6745         __output_copy(handle, values, n * sizeof(u64));
6746 }
6747
6748 static void perf_output_read_group(struct perf_output_handle *handle,
6749                             struct perf_event *event,
6750                             u64 enabled, u64 running)
6751 {
6752         struct perf_event *leader = event->group_leader, *sub;
6753         u64 read_format = event->attr.read_format;
6754         u64 values[5];
6755         int n = 0;
6756
6757         values[n++] = 1 + leader->nr_siblings;
6758
6759         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
6760                 values[n++] = enabled;
6761
6762         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
6763                 values[n++] = running;
6764
6765         if ((leader != event) &&
6766             (leader->state == PERF_EVENT_STATE_ACTIVE))
6767                 leader->pmu->read(leader);
6768
6769         values[n++] = perf_event_count(leader);
6770         if (read_format & PERF_FORMAT_ID)
6771                 values[n++] = primary_event_id(leader);
6772
6773         __output_copy(handle, values, n * sizeof(u64));
6774
6775         for_each_sibling_event(sub, leader) {
6776                 n = 0;
6777
6778                 if ((sub != event) &&
6779                     (sub->state == PERF_EVENT_STATE_ACTIVE))
6780                         sub->pmu->read(sub);
6781
6782                 values[n++] = perf_event_count(sub);
6783                 if (read_format & PERF_FORMAT_ID)
6784                         values[n++] = primary_event_id(sub);
6785
6786                 __output_copy(handle, values, n * sizeof(u64));
6787         }
6788 }
6789
6790 #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
6791                                  PERF_FORMAT_TOTAL_TIME_RUNNING)
6792
6793 /*
6794  * XXX PERF_SAMPLE_READ vs inherited events seems difficult.
6795  *
6796  * The problem is that it's both hard and excessively expensive to iterate the
6797  * child list, not to mention that it's impossible to IPI the children running
6798  * on another CPU, from interrupt/NMI context.
6799  */
6800 static void perf_output_read(struct perf_output_handle *handle,
6801                              struct perf_event *event)
6802 {
6803         u64 enabled = 0, running = 0, now;
6804         u64 read_format = event->attr.read_format;
6805
6806         /*
6807          * Compute total_time_enabled and total_time_running
6808          * based on snapshot values taken when the event
6809          * was last scheduled in.
6810          *
6811          * We cannot simply call update_context_time()
6812          * because of locking issues, as we are called in
6813          * NMI context.
6814          */
6815         if (read_format & PERF_FORMAT_TOTAL_TIMES)
6816                 calc_timer_values(event, &now, &enabled, &running);
6817
6818         if (event->attr.read_format & PERF_FORMAT_GROUP)
6819                 perf_output_read_group(handle, event, enabled, running);
6820         else
6821                 perf_output_read_one(handle, event, enabled, running);
6822 }
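
/*
 * The layout written by the two helpers above (the same layout that a
 * read(2) on the event fd produces), sketched in the
 * include/uapi/linux/perf_event.h comment style:
 *
 *	{ u64	value;
 *	  { u64	time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
 *	  { u64	time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
 *	  { u64	id;           } && PERF_FORMAT_ID
 *	} && !PERF_FORMAT_GROUP
 *
 *	{ u64	nr;
 *	  { u64	time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
 *	  { u64	time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
 *	  { u64	value;
 *	    { u64 id;           } && PERF_FORMAT_ID
 *	  }	cntr[nr];
 *	} && PERF_FORMAT_GROUP
 */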
6823
6824 static inline bool perf_sample_save_hw_index(struct perf_event *event)
6825 {
6826         return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX;
6827 }
6828
6829 void perf_output_sample(struct perf_output_handle *handle,
6830                         struct perf_event_header *header,
6831                         struct perf_sample_data *data,
6832                         struct perf_event *event)
6833 {
6834         u64 sample_type = data->type;
6835
6836         perf_output_put(handle, *header);
6837
6838         if (sample_type & PERF_SAMPLE_IDENTIFIER)
6839                 perf_output_put(handle, data->id);
6840
6841         if (sample_type & PERF_SAMPLE_IP)
6842                 perf_output_put(handle, data->ip);
6843
6844         if (sample_type & PERF_SAMPLE_TID)
6845                 perf_output_put(handle, data->tid_entry);
6846
6847         if (sample_type & PERF_SAMPLE_TIME)
6848                 perf_output_put(handle, data->time);
6849
6850         if (sample_type & PERF_SAMPLE_ADDR)
6851                 perf_output_put(handle, data->addr);
6852
6853         if (sample_type & PERF_SAMPLE_ID)
6854                 perf_output_put(handle, data->id);
6855
6856         if (sample_type & PERF_SAMPLE_STREAM_ID)
6857                 perf_output_put(handle, data->stream_id);
6858
6859         if (sample_type & PERF_SAMPLE_CPU)
6860                 perf_output_put(handle, data->cpu_entry);
6861
6862         if (sample_type & PERF_SAMPLE_PERIOD)
6863                 perf_output_put(handle, data->period);
6864
6865         if (sample_type & PERF_SAMPLE_READ)
6866                 perf_output_read(handle, event);
6867
6868         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
6869                 int size = 1;
6870
6871                 size += data->callchain->nr;
6872                 size *= sizeof(u64);
6873                 __output_copy(handle, data->callchain, size);
6874         }
6875
6876         if (sample_type & PERF_SAMPLE_RAW) {
6877                 struct perf_raw_record *raw = data->raw;
6878
6879                 if (raw) {
6880                         struct perf_raw_frag *frag = &raw->frag;
6881
6882                         perf_output_put(handle, raw->size);
6883                         do {
6884                                 if (frag->copy) {
6885                                         __output_custom(handle, frag->copy,
6886                                                         frag->data, frag->size);
6887                                 } else {
6888                                         __output_copy(handle, frag->data,
6889                                                       frag->size);
6890                                 }
6891                                 if (perf_raw_frag_last(frag))
6892                                         break;
6893                                 frag = frag->next;
6894                         } while (1);
6895                         if (frag->pad)
6896                                 __output_skip(handle, NULL, frag->pad);
6897                 } else {
6898                         struct {
6899                                 u32     size;
6900                                 u32     data;
6901                         } raw = {
6902                                 .size = sizeof(u32),
6903                                 .data = 0,
6904                         };
6905                         perf_output_put(handle, raw);
6906                 }
6907         }
6908
6909         if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
6910                 if (data->br_stack) {
6911                         size_t size;
6912
6913                         size = data->br_stack->nr
6914                              * sizeof(struct perf_branch_entry);
6915
6916                         perf_output_put(handle, data->br_stack->nr);
6917                         if (perf_sample_save_hw_index(event))
6918                                 perf_output_put(handle, data->br_stack->hw_idx);
6919                         perf_output_copy(handle, data->br_stack->entries, size);
6920                 } else {
6921                         /*
6922                          * we always store at least the value of nr
6923                          */
6924                         u64 nr = 0;
6925                         perf_output_put(handle, nr);
6926                 }
6927         }
6928
6929         if (sample_type & PERF_SAMPLE_REGS_USER) {
6930                 u64 abi = data->regs_user.abi;
6931
6932                 /*
6933                  * If there are no regs to dump, notice it through
6934                  * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
6935                  */
6936                 perf_output_put(handle, abi);
6937
6938                 if (abi) {
6939                         u64 mask = event->attr.sample_regs_user;
6940                         perf_output_sample_regs(handle,
6941                                                 data->regs_user.regs,
6942                                                 mask);
6943                 }
6944         }
6945
6946         if (sample_type & PERF_SAMPLE_STACK_USER) {
6947                 perf_output_sample_ustack(handle,
6948                                           data->stack_user_size,
6949                                           data->regs_user.regs);
6950         }
6951
6952         if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
6953                 perf_output_put(handle, data->weight.full);
6954
6955         if (sample_type & PERF_SAMPLE_DATA_SRC)
6956                 perf_output_put(handle, data->data_src.val);
6957
6958         if (sample_type & PERF_SAMPLE_TRANSACTION)
6959                 perf_output_put(handle, data->txn);
6960
6961         if (sample_type & PERF_SAMPLE_REGS_INTR) {
6962                 u64 abi = data->regs_intr.abi;
6963                 /*
6964                  * If there are no regs to dump, notice it through
6965                  * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
6966                  */
6967                 perf_output_put(handle, abi);
6968
6969                 if (abi) {
6970                         u64 mask = event->attr.sample_regs_intr;
6971
6972                         perf_output_sample_regs(handle,
6973                                                 data->regs_intr.regs,
6974                                                 mask);
6975                 }
6976         }
6977
6978         if (sample_type & PERF_SAMPLE_PHYS_ADDR)
6979                 perf_output_put(handle, data->phys_addr);
6980
6981         if (sample_type & PERF_SAMPLE_CGROUP)
6982                 perf_output_put(handle, data->cgroup);
6983
6984         if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
6985                 perf_output_put(handle, data->data_page_size);
6986
6987         if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
6988                 perf_output_put(handle, data->code_page_size);
6989
6990         if (sample_type & PERF_SAMPLE_AUX) {
6991                 perf_output_put(handle, data->aux_size);
6992
6993                 if (data->aux_size)
6994                         perf_aux_sample_output(event, handle, data);
6995         }
6996
6997         if (!event->attr.watermark) {
6998                 int wakeup_events = event->attr.wakeup_events;
6999
7000                 if (wakeup_events) {
7001                         struct perf_buffer *rb = handle->rb;
7002                         int events = local_inc_return(&rb->events);
7003
7004                         if (events >= wakeup_events) {
7005                                 local_sub(wakeup_events, &rb->events);
7006                                 local_inc(&rb->wakeup);
7007                         }
7008                 }
7009         }
7010 }
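
/*
 * As a concrete example of the output order above: for
 * sample_type == PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_TIME |
 * PERF_SAMPLE_CPU | PERF_SAMPLE_PERIOD, the PERF_RECORD_SAMPLE body
 * following the header is, in order:
 *
 *	{ u64	ip;       }
 *	{ u32	pid, tid; }
 *	{ u64	time;     }
 *	{ u32	cpu, res; }
 *	{ u64	period;   }
 */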
7011
7012 static u64 perf_virt_to_phys(u64 virt)
7013 {
7014         u64 phys_addr = 0;
7015         struct page *p = NULL;
7016
7017         if (!virt)
7018                 return 0;
7019
7020         if (virt >= TASK_SIZE) {
7021                 /* If it's vmalloc()d memory, leave phys_addr as 0 */
7022                 if (virt_addr_valid((void *)(uintptr_t)virt) &&
7023                     !(virt >= VMALLOC_START && virt < VMALLOC_END))
7024                         phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
7025         } else {
7026                 /*
7027                  * Walk the page tables for a user address.
7028                  * Interrupts are disabled, which prevents any teardown
7029                  * of the page tables.
7030                  * Try the IRQ-safe get_user_page_fast_only() first.
7031                  * If that fails, leave phys_addr as 0.
7032                  */
7033                 if (current->mm != NULL) {
7034                         pagefault_disable();
7035                         if (get_user_page_fast_only(virt, 0, &p))
7036                                 phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
7037                         pagefault_enable();
7038                 }
7039
7040                 if (p)
7041                         put_page(p);
7042         }
7043
7044         return phys_addr;
7045 }
7046
7047 /*
7048  * Return the pagetable size of a given virtual address.
7049  */
7050 static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
7051 {
7052         u64 size = 0;
7053
7054 #ifdef CONFIG_HAVE_FAST_GUP
7055         pgd_t *pgdp, pgd;
7056         p4d_t *p4dp, p4d;
7057         pud_t *pudp, pud;
7058         pmd_t *pmdp, pmd;
7059         pte_t *ptep, pte;
7060
7061         pgdp = pgd_offset(mm, addr);
7062         pgd = READ_ONCE(*pgdp);
7063         if (pgd_none(pgd))
7064                 return 0;
7065
7066         if (pgd_leaf(pgd))
7067                 return pgd_leaf_size(pgd);
7068
7069         p4dp = p4d_offset_lockless(pgdp, pgd, addr);
7070         p4d = READ_ONCE(*p4dp);
7071         if (!p4d_present(p4d))
7072                 return 0;
7073
7074         if (p4d_leaf(p4d))
7075                 return p4d_leaf_size(p4d);
7076
7077         pudp = pud_offset_lockless(p4dp, p4d, addr);
7078         pud = READ_ONCE(*pudp);
7079         if (!pud_present(pud))
7080                 return 0;
7081
7082         if (pud_leaf(pud))
7083                 return pud_leaf_size(pud);
7084
7085         pmdp = pmd_offset_lockless(pudp, pud, addr);
7086         pmd = READ_ONCE(*pmdp);
7087         if (!pmd_present(pmd))
7088                 return 0;
7089
7090         if (pmd_leaf(pmd))
7091                 return pmd_leaf_size(pmd);
7092
7093         ptep = pte_offset_map(&pmd, addr);
7094         pte = ptep_get_lockless(ptep);
7095         if (pte_present(pte))
7096                 size = pte_leaf_size(pte);
7097         pte_unmap(ptep);
7098 #endif /* CONFIG_HAVE_FAST_GUP */
7099
7100         return size;
7101 }
7102
7103 static u64 perf_get_page_size(unsigned long addr)
7104 {
7105         struct mm_struct *mm;
7106         unsigned long flags;
7107         u64 size;
7108
7109         if (!addr)
7110                 return 0;
7111
7112         /*
7113          * Software page-table walkers must disable IRQs,
7114          * which prevents any tear down of the page tables.
7115          */
7116         local_irq_save(flags);
7117
7118         mm = current->mm;
7119         if (!mm) {
7120                 /*
7121                  * For kernel threads and the like, use init_mm so that
7122                  * we can find kernel memory.
7123                  */
7124                 mm = &init_mm;
7125         }
7126
7127         size = perf_get_pgtable_size(mm, addr);
7128
7129         local_irq_restore(flags);
7130
7131         return size;
7132 }
7133
7134 static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
7135
7136 struct perf_callchain_entry *
7137 perf_callchain(struct perf_event *event, struct pt_regs *regs)
7138 {
7139         bool kernel = !event->attr.exclude_callchain_kernel;
7140         bool user   = !event->attr.exclude_callchain_user;
7141         /* Disallow cross-task user callchains. */
7142         bool crosstask = event->ctx->task && event->ctx->task != current;
7143         const u32 max_stack = event->attr.sample_max_stack;
7144         struct perf_callchain_entry *callchain;
7145
7146         if (!kernel && !user)
7147                 return &__empty_callchain;
7148
7149         callchain = get_perf_callchain(regs, 0, kernel, user,
7150                                        max_stack, crosstask, true);
7151         return callchain ?: &__empty_callchain;
7152 }
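
/*
 * A minimal user-space sketch (not part of this file) of requesting
 * callchains subject to the knobs consumed by perf_callchain() above;
 * the frequency and depth values are arbitrary and error handling is
 * trimmed:
 */
#if 0	/* illustrative only, built as stand-alone user-space code */
#include <linux/perf_event.h>
#include <string.h>

static void setup_callchain_attr(struct perf_event_attr *attr)
{
	memset(attr, 0, sizeof(*attr));
	attr->size = sizeof(*attr);
	attr->type = PERF_TYPE_HARDWARE;
	attr->config = PERF_COUNT_HW_CPU_CYCLES;
	attr->freq = 1;
	attr->sample_freq = 997;

	attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_CALLCHAIN;
	attr->sample_max_stack = 64;		/* capped by the sysctl limit */
	attr->exclude_callchain_kernel = 1;	/* user frames only */
}
#endif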
7153
7154 void perf_prepare_sample(struct perf_event_header *header,
7155                          struct perf_sample_data *data,
7156                          struct perf_event *event,
7157                          struct pt_regs *regs)
7158 {
7159         u64 sample_type = event->attr.sample_type;
7160
7161         header->type = PERF_RECORD_SAMPLE;
7162         header->size = sizeof(*header) + event->header_size;
7163
7164         header->misc = 0;
7165         header->misc |= perf_misc_flags(regs);
7166
7167         __perf_event_header__init_id(header, data, event);
7168
7169         if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE))
7170                 data->ip = perf_instruction_pointer(regs);
7171
7172         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
7173                 int size = 1;
7174
7175                 if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
7176                         data->callchain = perf_callchain(event, regs);
7177
7178                 size += data->callchain->nr;
7179
7180                 header->size += size * sizeof(u64);
7181         }
7182
7183         if (sample_type & PERF_SAMPLE_RAW) {
7184                 struct perf_raw_record *raw = data->raw;
7185                 int size;
7186
7187                 if (raw) {
7188                         struct perf_raw_frag *frag = &raw->frag;
7189                         u32 sum = 0;
7190
7191                         do {
7192                                 sum += frag->size;
7193                                 if (perf_raw_frag_last(frag))
7194                                         break;
7195                                 frag = frag->next;
7196                         } while (1);
7197
7198                         size = round_up(sum + sizeof(u32), sizeof(u64));
7199                         raw->size = size - sizeof(u32);
7200                         frag->pad = raw->size - sum;
7201                 } else {
7202                         size = sizeof(u64);
7203                 }
7204
7205                 header->size += size;
7206         }
7207
7208         if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
7209                 int size = sizeof(u64); /* nr */
7210                 if (data->br_stack) {
7211                         if (perf_sample_save_hw_index(event))
7212                                 size += sizeof(u64);
7213
7214                         size += data->br_stack->nr
7215                               * sizeof(struct perf_branch_entry);
7216                 }
7217                 header->size += size;
7218         }
7219
7220         if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
7221                 perf_sample_regs_user(&data->regs_user, regs);
7222
7223         if (sample_type & PERF_SAMPLE_REGS_USER) {
7224                 /* regs dump ABI info */
7225                 int size = sizeof(u64);
7226
7227                 if (data->regs_user.regs) {
7228                         u64 mask = event->attr.sample_regs_user;
7229                         size += hweight64(mask) * sizeof(u64);
7230                 }
7231
7232                 header->size += size;
7233         }
7234
7235         if (sample_type & PERF_SAMPLE_STACK_USER) {
7236                 /*
7237                  * Either the PERF_SAMPLE_STACK_USER bit needs to always be
7238                  * processed last, or an additional check must be added when
7239                  * a new sample type is introduced, because we could eat up
7240                  * the rest of the sample size.
7241                  */
7242                 u16 stack_size = event->attr.sample_stack_user;
7243                 u16 size = sizeof(u64);
7244
7245                 stack_size = perf_sample_ustack_size(stack_size, header->size,
7246                                                      data->regs_user.regs);
7247
7248                 /*
7249                  * If there is something to dump, add space for the dump
7250                  * itself and for the field that reports the dynamic size,
7251                  * i.e. how many bytes were actually dumped.
7252                  */
7253                 if (stack_size)
7254                         size += sizeof(u64) + stack_size;
7255
7256                 data->stack_user_size = stack_size;
7257                 header->size += size;
7258         }
7259
7260         if (sample_type & PERF_SAMPLE_REGS_INTR) {
7261                 /* regs dump ABI info */
7262                 int size = sizeof(u64);
7263
7264                 perf_sample_regs_intr(&data->regs_intr, regs);
7265
7266                 if (data->regs_intr.regs) {
7267                         u64 mask = event->attr.sample_regs_intr;
7268
7269                         size += hweight64(mask) * sizeof(u64);
7270                 }
7271
7272                 header->size += size;
7273         }
7274
7275         if (sample_type & PERF_SAMPLE_PHYS_ADDR)
7276                 data->phys_addr = perf_virt_to_phys(data->addr);
7277
7278 #ifdef CONFIG_CGROUP_PERF
7279         if (sample_type & PERF_SAMPLE_CGROUP) {
7280                 struct cgroup *cgrp;
7281
7282                 /* protected by RCU */
7283                 cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
7284                 data->cgroup = cgroup_id(cgrp);
7285         }
7286 #endif
7287
7288         /*
7289          * PERF_SAMPLE_DATA_PAGE_SIZE requires PERF_SAMPLE_ADDR. If the user
7290          * doesn't request PERF_SAMPLE_ADDR, the kernel still retrieves
7291          * data->addr implicitly, but the value is not dumped to user space.
7292          */
7293         if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
7294                 data->data_page_size = perf_get_page_size(data->addr);
7295
7296         if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
7297                 data->code_page_size = perf_get_page_size(data->ip);
7298
7299         if (sample_type & PERF_SAMPLE_AUX) {
7300                 u64 size;
7301
7302                 header->size += sizeof(u64); /* size */
7303
7304                 /*
7305                  * Given the 16-bit nature of header::size, an AUX sample can
7306                  * easily overflow it, what with all the preceding sample bits.
7307                  * Make sure this doesn't happen by using up to U16_MAX bytes
7308                  * per sample in total (rounded down to an 8-byte boundary).
7309                  */
7310                 size = min_t(size_t, U16_MAX - header->size,
7311                              event->attr.aux_sample_size);
7312                 size = rounddown(size, 8);
7313                 size = perf_prepare_sample_aux(event, data, size);
7314
7315                 WARN_ON_ONCE(size + header->size > U16_MAX);
7316                 header->size += size;
7317         }
7318         /*
7319          * If you're adding more sample types here, you likely need to do
7320          * something about the overflowing header::size, like repurposing the
7321          * lowest 3 bits of size, which should always be zero at the moment.
7322          * This raises a more important question: do we really need 512k-sized
7323          * samples, and why? So good argumentation is in order for whatever you
7324          * do here next.
7325          */
7326         WARN_ON_ONCE(header->size & 7);
7327 }
7328
7329 static __always_inline int
7330 __perf_event_output(struct perf_event *event,
7331                     struct perf_sample_data *data,
7332                     struct pt_regs *regs,
7333                     int (*output_begin)(struct perf_output_handle *,
7334                                         struct perf_sample_data *,
7335                                         struct perf_event *,
7336                                         unsigned int))
7337 {
7338         struct perf_output_handle handle;
7339         struct perf_event_header header;
7340         int err;
7341
7342         /* protect the callchain buffers */
7343         rcu_read_lock();
7344
7345         perf_prepare_sample(&header, data, event, regs);
7346
7347         err = output_begin(&handle, data, event, header.size);
7348         if (err)
7349                 goto exit;
7350
7351         perf_output_sample(&handle, &header, data, event);
7352
7353         perf_output_end(&handle);
7354
7355 exit:
7356         rcu_read_unlock();
7357         return err;
7358 }
7359
7360 void
7361 perf_event_output_forward(struct perf_event *event,
7362                          struct perf_sample_data *data,
7363                          struct pt_regs *regs)
7364 {
7365         __perf_event_output(event, data, regs, perf_output_begin_forward);
7366 }
7367
7368 void
7369 perf_event_output_backward(struct perf_event *event,
7370                            struct perf_sample_data *data,
7371                            struct pt_regs *regs)
7372 {
7373         __perf_event_output(event, data, regs, perf_output_begin_backward);
7374 }
7375
7376 int
7377 perf_event_output(struct perf_event *event,
7378                   struct perf_sample_data *data,
7379                   struct pt_regs *regs)
7380 {
7381         return __perf_event_output(event, data, regs, perf_output_begin);
7382 }
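
/*
 * A minimal user-space sketch (not part of this file) of requesting the
 * backward variant above, assuming the attr::write_backward bit selects
 * perf_event_output_backward() at event-creation time and is paired, as
 * is conventional, with a read-only mapping so user space never
 * advances data_tail:
 */
#if 0	/* illustrative only, built as stand-alone user-space code */
#include <linux/perf_event.h>
#include <sys/mman.h>
#include <stddef.h>

static void request_backward_output(struct perf_event_attr *attr)
{
	attr->write_backward = 1;
}

static void *map_overwritable_ring(int fd, size_t len)
{
	/* no PROT_WRITE: the buffer is treated as overwritable */
	return mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
}
#endif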
7383
7384 /*
7385  * read event_id
7386  */
7387
7388 struct perf_read_event {
7389         struct perf_event_header        header;
7390
7391         u32                             pid;
7392         u32                             tid;
7393 };
7394
7395 static void
7396 perf_event_read_event(struct perf_event *event,
7397                         struct task_struct *task)
7398 {
7399         struct perf_output_handle handle;
7400         struct perf_sample_data sample;
7401         struct perf_read_event read_event = {
7402                 .header = {
7403                         .type = PERF_RECORD_READ,
7404                         .misc = 0,
7405                         .size = sizeof(read_event) + event->read_size,
7406                 },
7407                 .pid = perf_event_pid(event, task),
7408                 .tid = perf_event_tid(event, task),
7409         };
7410         int ret;
7411
7412         perf_event_header__init_id(&read_event.header, &sample, event);
7413         ret = perf_output_begin(&handle, &sample, event, read_event.header.size);
7414         if (ret)
7415                 return;
7416
7417         perf_output_put(&handle, read_event);
7418         perf_output_read(&handle, event);
7419         perf_event__output_id_sample(event, &handle, &sample);
7420
7421         perf_output_end(&handle);
7422 }
7423
7424 typedef void (perf_iterate_f)(struct perf_event *event, void *data);
7425
7426 static void
7427 perf_iterate_ctx(struct perf_event_context *ctx,
7428                    perf_iterate_f output,
7429                    void *data, bool all)
7430 {
7431         struct perf_event *event;
7432
7433         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
7434                 if (!all) {
7435                         if (event->state < PERF_EVENT_STATE_INACTIVE)
7436                                 continue;
7437                         if (!event_filter_match(event))
7438                                 continue;
7439                 }
7440
7441                 output(event, data);
7442         }
7443 }
7444
7445 static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
7446 {
7447         struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
7448         struct perf_event *event;
7449
7450         list_for_each_entry_rcu(event, &pel->list, sb_list) {
7451                 /*
7452                  * Skip events that are not fully formed yet; ensure that
7453                  * if we observe event->ctx, both event and ctx will be
7454                  * complete enough. See perf_install_in_context().
7455                  */
7456                 if (!smp_load_acquire(&event->ctx))
7457                         continue;
7458
7459                 if (event->state < PERF_EVENT_STATE_INACTIVE)
7460                         continue;
7461                 if (!event_filter_match(event))
7462                         continue;
7463                 output(event, data);
7464         }
7465 }
7466
7467 /*
7468  * Iterate all events that need to receive side-band events.
7469  *
7470  * For new callers: ensure that account_pmu_sb_event() includes
7471  * your event; otherwise it might not get delivered.
7472  */
7473 static void
7474 perf_iterate_sb(perf_iterate_f output, void *data,
7475                struct perf_event_context *task_ctx)
7476 {
7477         struct perf_event_context *ctx;
7478         int ctxn;
7479
7480         rcu_read_lock();
7481         preempt_disable();
7482
7483         /*
7484          * If we have task_ctx != NULL we only notify the task context itself.
7485          * The task_ctx is set only for EXIT events before releasing task
7486          * context.
7487          */
7488         if (task_ctx) {
7489                 perf_iterate_ctx(task_ctx, output, data, false);
7490                 goto done;
7491         }
7492
7493         perf_iterate_sb_cpu(output, data);
7494
7495         for_each_task_context_nr(ctxn) {
7496                 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
7497                 if (ctx)
7498                         perf_iterate_ctx(ctx, output, data, false);
7499         }
7500 done:
7501         preempt_enable();
7502         rcu_read_unlock();
7503 }
7504
7505 /*
7506  * Clear all file-based filters at exec, they'll have to be
7507  * re-instated when/if these objects are mmapped again.
7508  */
7509 static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
7510 {
7511         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
7512         struct perf_addr_filter *filter;
7513         unsigned int restart = 0, count = 0;
7514         unsigned long flags;
7515
7516         if (!has_addr_filter(event))
7517                 return;
7518
7519         raw_spin_lock_irqsave(&ifh->lock, flags);
7520         list_for_each_entry(filter, &ifh->list, entry) {
7521                 if (filter->path.dentry) {
7522                         event->addr_filter_ranges[count].start = 0;
7523                         event->addr_filter_ranges[count].size = 0;
7524                         restart++;
7525                 }
7526
7527                 count++;
7528         }
7529
7530         if (restart)
7531                 event->addr_filters_gen++;
7532         raw_spin_unlock_irqrestore(&ifh->lock, flags);
7533
7534         if (restart)
7535                 perf_event_stop(event, 1);
7536 }
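
/*
 * A minimal user-space sketch (not part of this file) of installing a
 * file-backed address filter, which is exactly the kind of filter the
 * function above zaps on exec until the object is mmapped again. The
 * ioctl is PERF_EVENT_IOC_SET_FILTER; the range and object path below
 * are made up, and the precise filter-string syntax should be checked
 * against the kernel's address-filter parser:
 */
#if 0	/* illustrative only, built as stand-alone user-space code */
#include <linux/perf_event.h>
#include <sys/ioctl.h>

/* @fd: an fd for an address-filter capable event, e.g. intel_pt */
static int set_address_filter(int fd)
{
	return ioctl(fd, PERF_EVENT_IOC_SET_FILTER,
		     "filter 0x1000/0x2000@/usr/lib/libfoo.so");
}
#endif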
7537
7538 void perf_event_exec(void)
7539 {
7540         struct perf_event_context *ctx;
7541         int ctxn;
7542
7543         rcu_read_lock();
7544         for_each_task_context_nr(ctxn) {
7545                 ctx = current->perf_event_ctxp[ctxn];
7546                 if (!ctx)
7547                         continue;
7548
7549                 perf_event_enable_on_exec(ctxn);
7550
7551                 perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
7552                                    true);
7553         }
7554         rcu_read_unlock();
7555 }
7556
7557 struct remote_output {
7558         struct perf_buffer      *rb;
7559         int                     err;
7560 };
7561
7562 static void __perf_event_output_stop(struct perf_event *event, void *data)
7563 {
7564         struct perf_event *parent = event->parent;
7565         struct remote_output *ro = data;
7566         struct perf_buffer *rb = ro->rb;
7567         struct stop_event_data sd = {
7568                 .event  = event,
7569         };
7570
7571         if (!has_aux(event))
7572                 return;
7573
7574         if (!parent)
7575                 parent = event;
7576
7577         /*
7578          * In case of inheritance, it will be the parent that links to the
7579          * ring-buffer, but it will be the child that's actually using it.
7580          *
7581          * We are using event::rb to determine if the event should be stopped,
7582          * however this may race with ring_buffer_attach() (through set_output),
7583          * which will make us skip the event that actually needs to be stopped.
7584          * So ring_buffer_attach() has to stop an aux event before re-assigning
7585          * its rb pointer.
7586          */
7587         if (rcu_dereference(parent->rb) == rb)
7588                 ro->err = __perf_event_stop(&sd);
7589 }
7590
7591 static int __perf_pmu_output_stop(void *info)
7592 {
7593         struct perf_event *event = info;
7594         struct pmu *pmu = event->ctx->pmu;
7595         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
7596         struct remote_output ro = {
7597                 .rb     = event->rb,
7598         };
7599
7600         rcu_read_lock();
7601         perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
7602         if (cpuctx->task_ctx)
7603                 perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
7604                                    &ro, false);
7605         rcu_read_unlock();
7606
7607         return ro.err;
7608 }
7609
7610 static void perf_pmu_output_stop(struct perf_event *event)
7611 {
7612         struct perf_event *iter;
7613         int err, cpu;
7614
7615 restart:
7616         rcu_read_lock();
7617         list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
7618                 /*
7619                  * For per-CPU events, we need to make sure that neither they
7620                  * nor their children are running; for cpu==-1 events it's
7621                  * sufficient to stop the event itself if it's active, since
7622                  * it can't have children.
7623                  */
7624                 cpu = iter->cpu;
7625                 if (cpu == -1)
7626                         cpu = READ_ONCE(iter->oncpu);
7627
7628                 if (cpu == -1)
7629                         continue;
7630
7631                 err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
7632                 if (err == -EAGAIN) {
7633                         rcu_read_unlock();
7634                         goto restart;
7635                 }
7636         }
7637         rcu_read_unlock();
7638 }
7639
7640 /*
7641  * task tracking -- fork/exit
7642  *
7643  * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
7644  */
7645
7646 struct perf_task_event {
7647         struct task_struct              *task;
7648         struct perf_event_context       *task_ctx;
7649
7650         struct {
7651                 struct perf_event_header        header;
7652
7653                 u32                             pid;
7654                 u32                             ppid;
7655                 u32                             tid;
7656                 u32                             ptid;
7657                 u64                             time;
7658         } event_id;
7659 };
7660
7661 static int perf_event_task_match(struct perf_event *event)
7662 {
7663         return event->attr.comm  || event->attr.mmap ||
7664                event->attr.mmap2 || event->attr.mmap_data ||
7665                event->attr.task;
7666 }
7667
7668 static void perf_event_task_output(struct perf_event *event,
7669                                    void *data)
7670 {
7671         struct perf_task_event *task_event = data;
7672         struct perf_output_handle handle;
7673         struct perf_sample_data sample;
7674         struct task_struct *task = task_event->task;
7675         int ret, size = task_event->event_id.header.size;
7676
7677         if (!perf_event_task_match(event))
7678                 return;
7679
7680         perf_event_header__init_id(&task_event->event_id.header, &sample, event);
7681
7682         ret = perf_output_begin(&handle, &sample, event,
7683                                 task_event->event_id.header.size);
7684         if (ret)
7685                 goto out;
7686
7687         task_event->event_id.pid = perf_event_pid(event, task);
7688         task_event->event_id.tid = perf_event_tid(event, task);
7689
7690         if (task_event->event_id.header.type == PERF_RECORD_EXIT) {
7691                 task_event->event_id.ppid = perf_event_pid(event,
7692                                                         task->real_parent);
7693                 task_event->event_id.ptid = perf_event_pid(event,
7694                                                         task->real_parent);
7695         } else {  /* PERF_RECORD_FORK */
7696                 task_event->event_id.ppid = perf_event_pid(event, current);
7697                 task_event->event_id.ptid = perf_event_tid(event, current);
7698         }
7699
7700         task_event->event_id.time = perf_event_clock(event);
7701
7702         perf_output_put(&handle, task_event->event_id);
7703
7704         perf_event__output_id_sample(event, &handle, &sample);
7705
7706         perf_output_end(&handle);
7707 out:
7708         task_event->event_id.header.size = size;
7709 }
7710
7711 static void perf_event_task(struct task_struct *task,
7712                               struct perf_event_context *task_ctx,
7713                               int new)
7714 {
7715         struct perf_task_event task_event;
7716
7717         if (!atomic_read(&nr_comm_events) &&
7718             !atomic_read(&nr_mmap_events) &&
7719             !atomic_read(&nr_task_events))
7720                 return;
7721
7722         task_event = (struct perf_task_event){
7723                 .task     = task,
7724                 .task_ctx = task_ctx,
7725                 .event_id    = {
7726                         .header = {
7727                                 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
7728                                 .misc = 0,
7729                                 .size = sizeof(task_event.event_id),
7730                         },
7731                         /* .pid  */
7732                         /* .ppid */
7733                         /* .tid  */
7734                         /* .ptid */
7735                         /* .time */
7736                 },
7737         };
7738
7739         perf_iterate_sb(perf_event_task_output,
7740                        &task_event,
7741                        task_ctx);
7742 }
7743
7744 void perf_event_fork(struct task_struct *task)
7745 {
7746         perf_event_task(task, NULL, 1);
7747         perf_event_namespaces(task);
7748 }
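
/*
 * The resulting record, as seen by user space for both PERF_RECORD_FORK
 * and PERF_RECORD_EXIT (sketched from the event_id layout above, in the
 * include/uapi/linux/perf_event.h comment style):
 *
 *	struct {
 *		struct perf_event_header	header;
 *		u32				pid, ppid;
 *		u32				tid, ptid;
 *		u64				time;
 *		struct sample_id		sample_id;
 *	};
 */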
7749
7750 /*
7751  * comm tracking
7752  */
7753
7754 struct perf_comm_event {
7755         struct task_struct      *task;
7756         char                    *comm;
7757         int                     comm_size;
7758
7759         struct {
7760                 struct perf_event_header        header;
7761
7762                 u32                             pid;
7763                 u32                             tid;
7764         } event_id;
7765 };
7766
7767 static int perf_event_comm_match(struct perf_event *event)
7768 {
7769         return event->attr.comm;
7770 }
7771
7772 static void perf_event_comm_output(struct perf_event *event,
7773                                    void *data)
7774 {
7775         struct perf_comm_event *comm_event = data;
7776         struct perf_output_handle handle;
7777         struct perf_sample_data sample;
7778         int size = comm_event->event_id.header.size;
7779         int ret;
7780
7781         if (!perf_event_comm_match(event))
7782                 return;
7783
7784         perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
7785         ret = perf_output_begin(&handle, &sample, event,
7786                                 comm_event->event_id.header.size);
7787
7788         if (ret)
7789                 goto out;
7790
7791         comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
7792         comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
7793
7794         perf_output_put(&handle, comm_event->event_id);
7795         __output_copy(&handle, comm_event->comm,
7796                                    comm_event->comm_size);
7797
7798         perf_event__output_id_sample(event, &handle, &sample);
7799
7800         perf_output_end(&handle);
7801 out:
7802         comm_event->event_id.header.size = size;
7803 }
7804
7805 static void perf_event_comm_event(struct perf_comm_event *comm_event)
7806 {
7807         char comm[TASK_COMM_LEN];
7808         unsigned int size;
7809
7810         memset(comm, 0, sizeof(comm));
7811         strlcpy(comm, comm_event->task->comm, sizeof(comm));
7812         size = ALIGN(strlen(comm)+1, sizeof(u64));
7813
7814         comm_event->comm = comm;
7815         comm_event->comm_size = size;
7816
7817         comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
7818
7819         perf_iterate_sb(perf_event_comm_output,
7820                        comm_event,
7821                        NULL);
7822 }
7823
7824 void perf_event_comm(struct task_struct *task, bool exec)
7825 {
7826         struct perf_comm_event comm_event;
7827
7828         if (!atomic_read(&nr_comm_events))
7829                 return;
7830
7831         comm_event = (struct perf_comm_event){
7832                 .task   = task,
7833                 /* .comm      */
7834                 /* .comm_size */
7835                 .event_id  = {
7836                         .header = {
7837                                 .type = PERF_RECORD_COMM,
7838                                 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
7839                                 /* .size */
7840                         },
7841                         /* .pid */
7842                         /* .tid */
7843                 },
7844         };
7845
7846         perf_event_comm_event(&comm_event);
7847 }
7848
7849 /*
7850  * namespaces tracking
7851  */
7852
7853 struct perf_namespaces_event {
7854         struct task_struct              *task;
7855
7856         struct {
7857                 struct perf_event_header        header;
7858
7859                 u32                             pid;
7860                 u32                             tid;
7861                 u64                             nr_namespaces;
7862                 struct perf_ns_link_info        link_info[NR_NAMESPACES];
7863         } event_id;
7864 };
7865
7866 static int perf_event_namespaces_match(struct perf_event *event)
7867 {
7868         return event->attr.namespaces;
7869 }
7870
7871 static void perf_event_namespaces_output(struct perf_event *event,
7872                                          void *data)
7873 {
7874         struct perf_namespaces_event *namespaces_event = data;
7875         struct perf_output_handle handle;
7876         struct perf_sample_data sample;
7877         u16 header_size = namespaces_event->event_id.header.size;
7878         int ret;
7879
7880         if (!perf_event_namespaces_match(event))
7881                 return;
7882
7883         perf_event_header__init_id(&namespaces_event->event_id.header,
7884                                    &sample, event);
7885         ret = perf_output_begin(&handle, &sample, event,
7886                                 namespaces_event->event_id.header.size);
7887         if (ret)
7888                 goto out;
7889
7890         namespaces_event->event_id.pid = perf_event_pid(event,
7891                                                         namespaces_event->task);
7892         namespaces_event->event_id.tid = perf_event_tid(event,
7893                                                         namespaces_event->task);
7894
7895         perf_output_put(&handle, namespaces_event->event_id);
7896
7897         perf_event__output_id_sample(event, &handle, &sample);
7898
7899         perf_output_end(&handle);
7900 out:
7901         namespaces_event->event_id.header.size = header_size;
7902 }
7903
7904 static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
7905                                    struct task_struct *task,
7906                                    const struct proc_ns_operations *ns_ops)
7907 {
7908         struct path ns_path;
7909         struct inode *ns_inode;
7910         int error;
7911
7912         error = ns_get_path(&ns_path, task, ns_ops);
7913         if (!error) {
7914                 ns_inode = ns_path.dentry->d_inode;
7915                 ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
7916                 ns_link_info->ino = ns_inode->i_ino;
7917                 path_put(&ns_path);
7918         }
7919 }
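
/*
 * Editorial note: the dev/ino pair filled in above identifies the namespace
 * the same way userspace sees it through /proc/<pid>/ns/<name>, i.e. the
 * stat(2) st_dev/st_ino of that nsfs inode, so tools can correlate
 * PERF_RECORD_NAMESPACES entries with running processes.
 */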
7920
7921 void perf_event_namespaces(struct task_struct *task)
7922 {
7923         struct perf_namespaces_event namespaces_event;
7924         struct perf_ns_link_info *ns_link_info;
7925
7926         if (!atomic_read(&nr_namespaces_events))
7927                 return;
7928
7929         namespaces_event = (struct perf_namespaces_event){
7930                 .task   = task,
7931                 .event_id  = {
7932                         .header = {
7933                                 .type = PERF_RECORD_NAMESPACES,
7934                                 .misc = 0,
7935                                 .size = sizeof(namespaces_event.event_id),
7936                         },
7937                         /* .pid */
7938                         /* .tid */
7939                         .nr_namespaces = NR_NAMESPACES,
7940                         /* .link_info[NR_NAMESPACES] */
7941                 },
7942         };
7943
7944         ns_link_info = namespaces_event.event_id.link_info;
7945
7946         perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
7947                                task, &mntns_operations);
7948
7949 #ifdef CONFIG_USER_NS
7950         perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
7951                                task, &userns_operations);
7952 #endif
7953 #ifdef CONFIG_NET_NS
7954         perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
7955                                task, &netns_operations);
7956 #endif
7957 #ifdef CONFIG_UTS_NS
7958         perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
7959                                task, &utsns_operations);
7960 #endif
7961 #ifdef CONFIG_IPC_NS
7962         perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
7963                                task, &ipcns_operations);
7964 #endif
7965 #ifdef CONFIG_PID_NS
7966         perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
7967                                task, &pidns_operations);
7968 #endif
7969 #ifdef CONFIG_CGROUPS
7970         perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
7971                                task, &cgroupns_operations);
7972 #endif
7973
7974         perf_iterate_sb(perf_event_namespaces_output,
7975                         &namespaces_event,
7976                         NULL);
7977 }
7978
7979 /*
7980  * cgroup tracking
7981  */
7982 #ifdef CONFIG_CGROUP_PERF
7983
7984 struct perf_cgroup_event {
7985         char                            *path;
7986         int                             path_size;
7987         struct {
7988                 struct perf_event_header        header;
7989                 u64                             id;
7990                 char                            path[];
7991         } event_id;
7992 };
7993
7994 static int perf_event_cgroup_match(struct perf_event *event)
7995 {
7996         return event->attr.cgroup;
7997 }
7998
7999 static void perf_event_cgroup_output(struct perf_event *event, void *data)
8000 {
8001         struct perf_cgroup_event *cgroup_event = data;
8002         struct perf_output_handle handle;
8003         struct perf_sample_data sample;
8004         u16 header_size = cgroup_event->event_id.header.size;
8005         int ret;
8006
8007         if (!perf_event_cgroup_match(event))
8008                 return;
8009
8010         perf_event_header__init_id(&cgroup_event->event_id.header,
8011                                    &sample, event);
8012         ret = perf_output_begin(&handle, &sample, event,
8013                                 cgroup_event->event_id.header.size);
8014         if (ret)
8015                 goto out;
8016
8017         perf_output_put(&handle, cgroup_event->event_id);
8018         __output_copy(&handle, cgroup_event->path, cgroup_event->path_size);
8019
8020         perf_event__output_id_sample(event, &handle, &sample);
8021
8022         perf_output_end(&handle);
8023 out:
8024         cgroup_event->event_id.header.size = header_size;
8025 }
8026
8027 static void perf_event_cgroup(struct cgroup *cgrp)
8028 {
8029         struct perf_cgroup_event cgroup_event;
8030         char path_enomem[16] = "//enomem";
8031         char *pathname;
8032         size_t size;
8033
8034         if (!atomic_read(&nr_cgroup_events))
8035                 return;
8036
8037         cgroup_event = (struct perf_cgroup_event){
8038                 .event_id  = {
8039                         .header = {
8040                                 .type = PERF_RECORD_CGROUP,
8041                                 .misc = 0,
8042                                 .size = sizeof(cgroup_event.event_id),
8043                         },
8044                         .id = cgroup_id(cgrp),
8045                 },
8046         };
8047
8048         pathname = kmalloc(PATH_MAX, GFP_KERNEL);
8049         if (pathname == NULL) {
8050                 cgroup_event.path = path_enomem;
8051         } else {
8052                 /* just to be sure to have enough space for alignment */
8053                 cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64));
8054                 cgroup_event.path = pathname;
8055         }
8056
8057         /*
8058          * Since our buffer works in 8 byte units we need to align our string
8059          * size to a multiple of 8. However, we must guarantee the tail end is
8060          * zero'd out to avoid leaking random bits to userspace.
8061          */
8062         size = strlen(cgroup_event.path) + 1;
8063         while (!IS_ALIGNED(size, sizeof(u64)))
8064                 cgroup_event.path[size++] = '\0';
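		/*
		 * Editorial example: a path of "/a/b" occupies 5 bytes
		 * including its NUL and gets three more NULs appended, so
		 * path_size becomes 8.
		 */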
8065
8066         cgroup_event.event_id.header.size += size;
8067         cgroup_event.path_size = size;
8068
8069         perf_iterate_sb(perf_event_cgroup_output,
8070                         &cgroup_event,
8071                         NULL);
8072
8073         kfree(pathname);
8074 }
8075
8076 #endif
8077
8078 /*
8079  * mmap tracking
8080  */
8081
8082 struct perf_mmap_event {
8083         struct vm_area_struct   *vma;
8084
8085         const char              *file_name;
8086         int                     file_size;
8087         int                     maj, min;
8088         u64                     ino;
8089         u64                     ino_generation;
8090         u32                     prot, flags;
8091         u8                      build_id[BUILD_ID_SIZE_MAX];
8092         u32                     build_id_size;
8093
8094         struct {
8095                 struct perf_event_header        header;
8096
8097                 u32                             pid;
8098                 u32                             tid;
8099                 u64                             start;
8100                 u64                             len;
8101                 u64                             pgoff;
8102         } event_id;
8103 };
8104
8105 static int perf_event_mmap_match(struct perf_event *event,
8106                                  void *data)
8107 {
8108         struct perf_mmap_event *mmap_event = data;
8109         struct vm_area_struct *vma = mmap_event->vma;
8110         int executable = vma->vm_flags & VM_EXEC;
8111
8112         return (!executable && event->attr.mmap_data) ||
8113                (executable && (event->attr.mmap || event->attr.mmap2));
8114 }
8115
8116 static void perf_event_mmap_output(struct perf_event *event,
8117                                    void *data)
8118 {
8119         struct perf_mmap_event *mmap_event = data;
8120         struct perf_output_handle handle;
8121         struct perf_sample_data sample;
8122         int size = mmap_event->event_id.header.size;
8123         u32 type = mmap_event->event_id.header.type;
8124         bool use_build_id;
8125         int ret;
8126
8127         if (!perf_event_mmap_match(event, data))
8128                 return;
8129
8130         if (event->attr.mmap2) {
8131                 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
8132                 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
8133                 mmap_event->event_id.header.size += sizeof(mmap_event->min);
8134                 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
8135                 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
8136                 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
8137                 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
8138         }
8139
8140         perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
8141         ret = perf_output_begin(&handle, &sample, event,
8142                                 mmap_event->event_id.header.size);
8143         if (ret)
8144                 goto out;
8145
8146         mmap_event->event_id.pid = perf_event_pid(event, current);
8147         mmap_event->event_id.tid = perf_event_tid(event, current);
8148
8149         use_build_id = event->attr.build_id && mmap_event->build_id_size;
8150
8151         if (event->attr.mmap2 && use_build_id)
8152                 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_BUILD_ID;
8153
8154         perf_output_put(&handle, mmap_event->event_id);
8155
8156         if (event->attr.mmap2) {
8157                 if (use_build_id) {
8158                         u8 size[4] = { (u8) mmap_event->build_id_size, 0, 0, 0 };
8159
8160                         __output_copy(&handle, size, 4);
8161                         __output_copy(&handle, mmap_event->build_id, BUILD_ID_SIZE_MAX);
8162                 } else {
8163                         perf_output_put(&handle, mmap_event->maj);
8164                         perf_output_put(&handle, mmap_event->min);
8165                         perf_output_put(&handle, mmap_event->ino);
8166                         perf_output_put(&handle, mmap_event->ino_generation);
8167                 }
8168                 perf_output_put(&handle, mmap_event->prot);
8169                 perf_output_put(&handle, mmap_event->flags);
8170         }
8171
8172         __output_copy(&handle, mmap_event->file_name,
8173                                    mmap_event->file_size);
8174
8175         perf_event__output_id_sample(event, &handle, &sample);
8176
8177         perf_output_end(&handle);
8178 out:
8179         mmap_event->event_id.header.size = size;
8180         mmap_event->event_id.header.type = type;
8181 }
8182
8183 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
8184 {
8185         struct vm_area_struct *vma = mmap_event->vma;
8186         struct file *file = vma->vm_file;
8187         int maj = 0, min = 0;
8188         u64 ino = 0, gen = 0;
8189         u32 prot = 0, flags = 0;
8190         unsigned int size;
8191         char tmp[16];
8192         char *buf = NULL;
8193         char *name;
8194
8195         if (vma->vm_flags & VM_READ)
8196                 prot |= PROT_READ;
8197         if (vma->vm_flags & VM_WRITE)
8198                 prot |= PROT_WRITE;
8199         if (vma->vm_flags & VM_EXEC)
8200                 prot |= PROT_EXEC;
8201
8202         if (vma->vm_flags & VM_MAYSHARE)
8203                 flags = MAP_SHARED;
8204         else
8205                 flags = MAP_PRIVATE;
8206
8207         if (vma->vm_flags & VM_DENYWRITE)
8208                 flags |= MAP_DENYWRITE;
8209         if (vma->vm_flags & VM_MAYEXEC)
8210                 flags |= MAP_EXECUTABLE;
8211         if (vma->vm_flags & VM_LOCKED)
8212                 flags |= MAP_LOCKED;
8213         if (is_vm_hugetlb_page(vma))
8214                 flags |= MAP_HUGETLB;
8215
8216         if (file) {
8217                 struct inode *inode;
8218                 dev_t dev;
8219
8220                 buf = kmalloc(PATH_MAX, GFP_KERNEL);
8221                 if (!buf) {
8222                         name = "//enomem";
8223                         goto cpy_name;
8224                 }
8225                 /*
8226                  * d_path() works from the end of the buffer backwards, so we
8227                  * need to add enough zero bytes after the string to handle
8228                  * the 64bit alignment we do later.
8229                  */
8230                 name = file_path(file, buf, PATH_MAX - sizeof(u64));
8231                 if (IS_ERR(name)) {
8232                         name = "//toolong";
8233                         goto cpy_name;
8234                 }
8235                 inode = file_inode(vma->vm_file);
8236                 dev = inode->i_sb->s_dev;
8237                 ino = inode->i_ino;
8238                 gen = inode->i_generation;
8239                 maj = MAJOR(dev);
8240                 min = MINOR(dev);
8241
8242                 goto got_name;
8243         } else {
8244                 if (vma->vm_ops && vma->vm_ops->name) {
8245                         name = (char *) vma->vm_ops->name(vma);
8246                         if (name)
8247                                 goto cpy_name;
8248                 }
8249
8250                 name = (char *)arch_vma_name(vma);
8251                 if (name)
8252                         goto cpy_name;
8253
8254                 if (vma->vm_start <= vma->vm_mm->start_brk &&
8255                                 vma->vm_end >= vma->vm_mm->brk) {
8256                         name = "[heap]";
8257                         goto cpy_name;
8258                 }
8259                 if (vma->vm_start <= vma->vm_mm->start_stack &&
8260                                 vma->vm_end >= vma->vm_mm->start_stack) {
8261                         name = "[stack]";
8262                         goto cpy_name;
8263                 }
8264
8265                 name = "//anon";
8266                 goto cpy_name;
8267         }
8268
8269 cpy_name:
8270         strlcpy(tmp, name, sizeof(tmp));
8271         name = tmp;
8272 got_name:
8273         /*
8274          * Since our buffer works in 8 byte units we need to align our string
8275          * size to a multiple of 8. However, we must guarantee the tail end is
8276          * zero'd out to avoid leaking random bits to userspace.
8277          */
8278         size = strlen(name)+1;
8279         while (!IS_ALIGNED(size, sizeof(u64)))
8280                 name[size++] = '\0';
8281
8282         mmap_event->file_name = name;
8283         mmap_event->file_size = size;
8284         mmap_event->maj = maj;
8285         mmap_event->min = min;
8286         mmap_event->ino = ino;
8287         mmap_event->ino_generation = gen;
8288         mmap_event->prot = prot;
8289         mmap_event->flags = flags;
8290
8291         if (!(vma->vm_flags & VM_EXEC))
8292                 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
8293
8294         mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
8295
8296         if (atomic_read(&nr_build_id_events))
8297                 build_id_parse(vma, mmap_event->build_id, &mmap_event->build_id_size);
8298
8299         perf_iterate_sb(perf_event_mmap_output,
8300                        mmap_event,
8301                        NULL);
8302
8303         kfree(buf);
8304 }
8305
8306 /*
8307  * Check whether inode and address range match filter criteria.
8308  */
8309 static bool perf_addr_filter_match(struct perf_addr_filter *filter,
8310                                      struct file *file, unsigned long offset,
8311                                      unsigned long size)
8312 {
8313         /* d_inode(NULL) won't be equal to any mapped user-space file */
8314         if (!filter->path.dentry)
8315                 return false;
8316
8317         if (d_inode(filter->path.dentry) != file_inode(file))
8318                 return false;
8319
8320         if (filter->offset > offset + size)
8321                 return false;
8322
8323         if (filter->offset + filter->size < offset)
8324                 return false;
8325
8326         return true;
8327 }
8328
8329 static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter,
8330                                         struct vm_area_struct *vma,
8331                                         struct perf_addr_filter_range *fr)
8332 {
8333         unsigned long vma_size = vma->vm_end - vma->vm_start;
8334         unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
8335         struct file *file = vma->vm_file;
8336
8337         if (!perf_addr_filter_match(filter, file, off, vma_size))
8338                 return false;
8339
8340         if (filter->offset < off) {
8341                 fr->start = vma->vm_start;
8342                 fr->size = min(vma_size, filter->size - (off - filter->offset));
8343         } else {
8344                 fr->start = vma->vm_start + filter->offset - off;
8345                 fr->size = min(vma->vm_end - fr->start, filter->size);
8346         }
8347
8348         return true;
8349 }
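
/*
 * Editorial example of the intersection above: a filter covering file offsets
 * [0x2000, 0x3000) against a VMA that maps file offset 0x1000 at vm_start
 * 0x400000 for 0x3000 bytes yields fr->start = 0x401000 and fr->size = 0x1000;
 * had the filter started below the mapped offset, the range would instead be
 * clamped to vm_start.
 */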
8350
8351 static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
8352 {
8353         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
8354         struct vm_area_struct *vma = data;
8355         struct perf_addr_filter *filter;
8356         unsigned int restart = 0, count = 0;
8357         unsigned long flags;
8358
8359         if (!has_addr_filter(event))
8360                 return;
8361
8362         if (!vma->vm_file)
8363                 return;
8364
8365         raw_spin_lock_irqsave(&ifh->lock, flags);
8366         list_for_each_entry(filter, &ifh->list, entry) {
8367                 if (perf_addr_filter_vma_adjust(filter, vma,
8368                                                 &event->addr_filter_ranges[count]))
8369                         restart++;
8370
8371                 count++;
8372         }
8373
8374         if (restart)
8375                 event->addr_filters_gen++;
8376         raw_spin_unlock_irqrestore(&ifh->lock, flags);
8377
8378         if (restart)
8379                 perf_event_stop(event, 1);
8380 }
8381
8382 /*
8383  * Adjust all of the task's events' address filters to the new vma
8384  */
8385 static void perf_addr_filters_adjust(struct vm_area_struct *vma)
8386 {
8387         struct perf_event_context *ctx;
8388         int ctxn;
8389
8390         /*
8391          * Data tracing isn't supported yet and as such there is no need
8392          * to keep track of anything that isn't related to executable code:
8393          */
8394         if (!(vma->vm_flags & VM_EXEC))
8395                 return;
8396
8397         rcu_read_lock();
8398         for_each_task_context_nr(ctxn) {
8399                 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
8400                 if (!ctx)
8401                         continue;
8402
8403                 perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
8404         }
8405         rcu_read_unlock();
8406 }
8407
8408 void perf_event_mmap(struct vm_area_struct *vma)
8409 {
8410         struct perf_mmap_event mmap_event;
8411
8412         if (!atomic_read(&nr_mmap_events))
8413                 return;
8414
8415         mmap_event = (struct perf_mmap_event){
8416                 .vma    = vma,
8417                 /* .file_name */
8418                 /* .file_size */
8419                 .event_id  = {
8420                         .header = {
8421                                 .type = PERF_RECORD_MMAP,
8422                                 .misc = PERF_RECORD_MISC_USER,
8423                                 /* .size */
8424                         },
8425                         /* .pid */
8426                         /* .tid */
8427                         .start  = vma->vm_start,
8428                         .len    = vma->vm_end - vma->vm_start,
8429                         .pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
8430                 },
8431                 /* .maj (attr_mmap2 only) */
8432                 /* .min (attr_mmap2 only) */
8433                 /* .ino (attr_mmap2 only) */
8434                 /* .ino_generation (attr_mmap2 only) */
8435                 /* .prot (attr_mmap2 only) */
8436                 /* .flags (attr_mmap2 only) */
8437         };
8438
8439         perf_addr_filters_adjust(vma);
8440         perf_event_mmap_event(&mmap_event);
8441 }
8442
8443 void perf_event_aux_event(struct perf_event *event, unsigned long head,
8444                           unsigned long size, u64 flags)
8445 {
8446         struct perf_output_handle handle;
8447         struct perf_sample_data sample;
8448         struct perf_aux_event {
8449                 struct perf_event_header        header;
8450                 u64                             offset;
8451                 u64                             size;
8452                 u64                             flags;
8453         } rec = {
8454                 .header = {
8455                         .type = PERF_RECORD_AUX,
8456                         .misc = 0,
8457                         .size = sizeof(rec),
8458                 },
8459                 .offset         = head,
8460                 .size           = size,
8461                 .flags          = flags,
8462         };
8463         int ret;
8464
8465         perf_event_header__init_id(&rec.header, &sample, event);
8466         ret = perf_output_begin(&handle, &sample, event, rec.header.size);
8467
8468         if (ret)
8469                 return;
8470
8471         perf_output_put(&handle, rec);
8472         perf_event__output_id_sample(event, &handle, &sample);
8473
8474         perf_output_end(&handle);
8475 }
8476
8477 /*
8478  * Lost/dropped samples logging
8479  */
8480 void perf_log_lost_samples(struct perf_event *event, u64 lost)
8481 {
8482         struct perf_output_handle handle;
8483         struct perf_sample_data sample;
8484         int ret;
8485
8486         struct {
8487                 struct perf_event_header        header;
8488                 u64                             lost;
8489         } lost_samples_event = {
8490                 .header = {
8491                         .type = PERF_RECORD_LOST_SAMPLES,
8492                         .misc = 0,
8493                         .size = sizeof(lost_samples_event),
8494                 },
8495                 .lost           = lost,
8496         };
8497
8498         perf_event_header__init_id(&lost_samples_event.header, &sample, event);
8499
8500         ret = perf_output_begin(&handle, &sample, event,
8501                                 lost_samples_event.header.size);
8502         if (ret)
8503                 return;
8504
8505         perf_output_put(&handle, lost_samples_event);
8506         perf_event__output_id_sample(event, &handle, &sample);
8507         perf_output_end(&handle);
8508 }
8509
8510 /*
8511  * context_switch tracking
8512  */
8513
8514 struct perf_switch_event {
8515         struct task_struct      *task;
8516         struct task_struct      *next_prev;
8517
8518         struct {
8519                 struct perf_event_header        header;
8520                 u32                             next_prev_pid;
8521                 u32                             next_prev_tid;
8522         } event_id;
8523 };
8524
8525 static int perf_event_switch_match(struct perf_event *event)
8526 {
8527         return event->attr.context_switch;
8528 }
8529
8530 static void perf_event_switch_output(struct perf_event *event, void *data)
8531 {
8532         struct perf_switch_event *se = data;
8533         struct perf_output_handle handle;
8534         struct perf_sample_data sample;
8535         int ret;
8536
8537         if (!perf_event_switch_match(event))
8538                 return;
8539
8540         /* Only CPU-wide events are allowed to see next/prev pid/tid */
8541         if (event->ctx->task) {
8542                 se->event_id.header.type = PERF_RECORD_SWITCH;
8543                 se->event_id.header.size = sizeof(se->event_id.header);
8544         } else {
8545                 se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
8546                 se->event_id.header.size = sizeof(se->event_id);
8547                 se->event_id.next_prev_pid =
8548                                         perf_event_pid(event, se->next_prev);
8549                 se->event_id.next_prev_tid =
8550                                         perf_event_tid(event, se->next_prev);
8551         }
8552
8553         perf_event_header__init_id(&se->event_id.header, &sample, event);
8554
8555         ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size);
8556         if (ret)
8557                 return;
8558
8559         if (event->ctx->task)
8560                 perf_output_put(&handle, se->event_id.header);
8561         else
8562                 perf_output_put(&handle, se->event_id);
8563
8564         perf_event__output_id_sample(event, &handle, &sample);
8565
8566         perf_output_end(&handle);
8567 }
8568
8569 static void perf_event_switch(struct task_struct *task,
8570                               struct task_struct *next_prev, bool sched_in)
8571 {
8572         struct perf_switch_event switch_event;
8573
8574         /* N.B. caller checks nr_switch_events != 0 */
8575
8576         switch_event = (struct perf_switch_event){
8577                 .task           = task,
8578                 .next_prev      = next_prev,
8579                 .event_id       = {
8580                         .header = {
8581                                 /* .type */
8582                                 .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
8583                                 /* .size */
8584                         },
8585                         /* .next_prev_pid */
8586                         /* .next_prev_tid */
8587                 },
8588         };
8589
8590         if (!sched_in && task->state == TASK_RUNNING)
8591                 switch_event.event_id.header.misc |=
8592                                 PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
8593
8594         perf_iterate_sb(perf_event_switch_output,
8595                        &switch_event,
8596                        NULL);
8597 }
8598
8599 /*
8600  * IRQ throttle logging
8601  */
8602
8603 static void perf_log_throttle(struct perf_event *event, int enable)
8604 {
8605         struct perf_output_handle handle;
8606         struct perf_sample_data sample;
8607         int ret;
8608
8609         struct {
8610                 struct perf_event_header        header;
8611                 u64                             time;
8612                 u64                             id;
8613                 u64                             stream_id;
8614         } throttle_event = {
8615                 .header = {
8616                         .type = PERF_RECORD_THROTTLE,
8617                         .misc = 0,
8618                         .size = sizeof(throttle_event),
8619                 },
8620                 .time           = perf_event_clock(event),
8621                 .id             = primary_event_id(event),
8622                 .stream_id      = event->id,
8623         };
8624
8625         if (enable)
8626                 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
8627
8628         perf_event_header__init_id(&throttle_event.header, &sample, event);
8629
8630         ret = perf_output_begin(&handle, &sample, event,
8631                                 throttle_event.header.size);
8632         if (ret)
8633                 return;
8634
8635         perf_output_put(&handle, throttle_event);
8636         perf_event__output_id_sample(event, &handle, &sample);
8637         perf_output_end(&handle);
8638 }
8639
8640 /*
8641  * ksymbol register/unregister tracking
8642  */
8643
8644 struct perf_ksymbol_event {
8645         const char      *name;
8646         int             name_len;
8647         struct {
8648                 struct perf_event_header        header;
8649                 u64                             addr;
8650                 u32                             len;
8651                 u16                             ksym_type;
8652                 u16                             flags;
8653         } event_id;
8654 };
8655
8656 static int perf_event_ksymbol_match(struct perf_event *event)
8657 {
8658         return event->attr.ksymbol;
8659 }
8660
8661 static void perf_event_ksymbol_output(struct perf_event *event, void *data)
8662 {
8663         struct perf_ksymbol_event *ksymbol_event = data;
8664         struct perf_output_handle handle;
8665         struct perf_sample_data sample;
8666         int ret;
8667
8668         if (!perf_event_ksymbol_match(event))
8669                 return;
8670
8671         perf_event_header__init_id(&ksymbol_event->event_id.header,
8672                                    &sample, event);
8673         ret = perf_output_begin(&handle, &sample, event,
8674                                 ksymbol_event->event_id.header.size);
8675         if (ret)
8676                 return;
8677
8678         perf_output_put(&handle, ksymbol_event->event_id);
8679         __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len);
8680         perf_event__output_id_sample(event, &handle, &sample);
8681
8682         perf_output_end(&handle);
8683 }
8684
8685 void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
8686                         const char *sym)
8687 {
8688         struct perf_ksymbol_event ksymbol_event;
8689         char name[KSYM_NAME_LEN];
8690         u16 flags = 0;
8691         int name_len;
8692
8693         if (!atomic_read(&nr_ksymbol_events))
8694                 return;
8695
8696         if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX ||
8697             ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
8698                 goto err;
8699
8700         strlcpy(name, sym, KSYM_NAME_LEN);
8701         name_len = strlen(name) + 1;
8702         while (!IS_ALIGNED(name_len, sizeof(u64)))
8703                 name[name_len++] = '\0';
8704         BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));
8705
8706         if (unregister)
8707                 flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER;
8708
8709         ksymbol_event = (struct perf_ksymbol_event){
8710                 .name = name,
8711                 .name_len = name_len,
8712                 .event_id = {
8713                         .header = {
8714                                 .type = PERF_RECORD_KSYMBOL,
8715                                 .size = sizeof(ksymbol_event.event_id) +
8716                                         name_len,
8717                         },
8718                         .addr = addr,
8719                         .len = len,
8720                         .ksym_type = ksym_type,
8721                         .flags = flags,
8722                 },
8723         };
8724
8725         perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL);
8726         return;
8727 err:
8728         WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
8729 }
8730
8731 /*
8732  * bpf program load/unload tracking
8733  */
8734
8735 struct perf_bpf_event {
8736         struct bpf_prog *prog;
8737         struct {
8738                 struct perf_event_header        header;
8739                 u16                             type;
8740                 u16                             flags;
8741                 u32                             id;
8742                 u8                              tag[BPF_TAG_SIZE];
8743         } event_id;
8744 };
8745
8746 static int perf_event_bpf_match(struct perf_event *event)
8747 {
8748         return event->attr.bpf_event;
8749 }
8750
8751 static void perf_event_bpf_output(struct perf_event *event, void *data)
8752 {
8753         struct perf_bpf_event *bpf_event = data;
8754         struct perf_output_handle handle;
8755         struct perf_sample_data sample;
8756         int ret;
8757
8758         if (!perf_event_bpf_match(event))
8759                 return;
8760
8761         perf_event_header__init_id(&bpf_event->event_id.header,
8762                                    &sample, event);
8763         ret = perf_output_begin(&handle, &sample, event,
8764                                 bpf_event->event_id.header.size);
8765         if (ret)
8766                 return;
8767
8768         perf_output_put(&handle, bpf_event->event_id);
8769         perf_event__output_id_sample(event, &handle, &sample);
8770
8771         perf_output_end(&handle);
8772 }
8773
8774 static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
8775                                          enum perf_bpf_event_type type)
8776 {
8777         bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
8778         int i;
8779
8780         if (prog->aux->func_cnt == 0) {
8781                 perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
8782                                    (u64)(unsigned long)prog->bpf_func,
8783                                    prog->jited_len, unregister,
8784                                    prog->aux->ksym.name);
8785         } else {
8786                 for (i = 0; i < prog->aux->func_cnt; i++) {
8787                         struct bpf_prog *subprog = prog->aux->func[i];
8788
8789                         perf_event_ksymbol(
8790                                 PERF_RECORD_KSYMBOL_TYPE_BPF,
8791                                 (u64)(unsigned long)subprog->bpf_func,
8792                                 subprog->jited_len, unregister,
8793                                 subprog->aux->ksym.name);
8794                 }
8795         }
8796 }
8797
8798 void perf_event_bpf_event(struct bpf_prog *prog,
8799                           enum perf_bpf_event_type type,
8800                           u16 flags)
8801 {
8802         struct perf_bpf_event bpf_event;
8803
8804         if (type <= PERF_BPF_EVENT_UNKNOWN ||
8805             type >= PERF_BPF_EVENT_MAX)
8806                 return;
8807
8808         switch (type) {
8809         case PERF_BPF_EVENT_PROG_LOAD:
8810         case PERF_BPF_EVENT_PROG_UNLOAD:
8811                 if (atomic_read(&nr_ksymbol_events))
8812                         perf_event_bpf_emit_ksymbols(prog, type);
8813                 break;
8814         default:
8815                 break;
8816         }
8817
8818         if (!atomic_read(&nr_bpf_events))
8819                 return;
8820
8821         bpf_event = (struct perf_bpf_event){
8822                 .prog = prog,
8823                 .event_id = {
8824                         .header = {
8825                                 .type = PERF_RECORD_BPF_EVENT,
8826                                 .size = sizeof(bpf_event.event_id),
8827                         },
8828                         .type = type,
8829                         .flags = flags,
8830                         .id = prog->aux->id,
8831                 },
8832         };
8833
8834         BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64));
8835
8836         memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);
8837         perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
8838 }
8839
8840 struct perf_text_poke_event {
8841         const void              *old_bytes;
8842         const void              *new_bytes;
8843         size_t                  pad;
8844         u16                     old_len;
8845         u16                     new_len;
8846
8847         struct {
8848                 struct perf_event_header        header;
8849
8850                 u64                             addr;
8851         } event_id;
8852 };
8853
8854 static int perf_event_text_poke_match(struct perf_event *event)
8855 {
8856         return event->attr.text_poke;
8857 }
8858
8859 static void perf_event_text_poke_output(struct perf_event *event, void *data)
8860 {
8861         struct perf_text_poke_event *text_poke_event = data;
8862         struct perf_output_handle handle;
8863         struct perf_sample_data sample;
8864         u64 padding = 0;
8865         int ret;
8866
8867         if (!perf_event_text_poke_match(event))
8868                 return;
8869
8870         perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event);
8871
8872         ret = perf_output_begin(&handle, &sample, event,
8873                                 text_poke_event->event_id.header.size);
8874         if (ret)
8875                 return;
8876
8877         perf_output_put(&handle, text_poke_event->event_id);
8878         perf_output_put(&handle, text_poke_event->old_len);
8879         perf_output_put(&handle, text_poke_event->new_len);
8880
8881         __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len);
8882         __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len);
8883
8884         if (text_poke_event->pad)
8885                 __output_copy(&handle, &padding, text_poke_event->pad);
8886
8887         perf_event__output_id_sample(event, &handle, &sample);
8888
8889         perf_output_end(&handle);
8890 }
8891
8892 void perf_event_text_poke(const void *addr, const void *old_bytes,
8893                           size_t old_len, const void *new_bytes, size_t new_len)
8894 {
8895         struct perf_text_poke_event text_poke_event;
8896         size_t tot, pad;
8897
8898         if (!atomic_read(&nr_text_poke_events))
8899                 return;
8900
8901         tot  = sizeof(text_poke_event.old_len) + old_len;
8902         tot += sizeof(text_poke_event.new_len) + new_len;
8903         pad  = ALIGN(tot, sizeof(u64)) - tot;
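	/*
	 * Editorial example: old_len == 5 and new_len == 3 give tot == 12
	 * (two u16 lengths plus both byte arrays), so pad == 4 brings the
	 * variable part up to a 16-byte, u64-aligned payload.
	 */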
8904
8905         text_poke_event = (struct perf_text_poke_event){
8906                 .old_bytes    = old_bytes,
8907                 .new_bytes    = new_bytes,
8908                 .pad          = pad,
8909                 .old_len      = old_len,
8910                 .new_len      = new_len,
8911                 .event_id  = {
8912                         .header = {
8913                                 .type = PERF_RECORD_TEXT_POKE,
8914                                 .misc = PERF_RECORD_MISC_KERNEL,
8915                                 .size = sizeof(text_poke_event.event_id) + tot + pad,
8916                         },
8917                         .addr = (unsigned long)addr,
8918                 },
8919         };
8920
8921         perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL);
8922 }
8923
8924 void perf_event_itrace_started(struct perf_event *event)
8925 {
8926         event->attach_state |= PERF_ATTACH_ITRACE;
8927 }
8928
8929 static void perf_log_itrace_start(struct perf_event *event)
8930 {
8931         struct perf_output_handle handle;
8932         struct perf_sample_data sample;
8933         struct perf_aux_event {
8934                 struct perf_event_header        header;
8935                 u32                             pid;
8936                 u32                             tid;
8937         } rec;
8938         int ret;
8939
8940         if (event->parent)
8941                 event = event->parent;
8942
8943         if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
8944             event->attach_state & PERF_ATTACH_ITRACE)
8945                 return;
8946
8947         rec.header.type = PERF_RECORD_ITRACE_START;
8948         rec.header.misc = 0;
8949         rec.header.size = sizeof(rec);
8950         rec.pid = perf_event_pid(event, current);
8951         rec.tid = perf_event_tid(event, current);
8952
8953         perf_event_header__init_id(&rec.header, &sample, event);
8954         ret = perf_output_begin(&handle, &sample, event, rec.header.size);
8955
8956         if (ret)
8957                 return;
8958
8959         perf_output_put(&handle, rec);
8960         perf_event__output_id_sample(event, &handle, &sample);
8961
8962         perf_output_end(&handle);
8963 }
8964
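/*
 * Editorial summary: perf_throttled_seq advances once per timer tick, so
 * hwc->interrupts counts samples taken within the current tick.  Crossing
 * max_samples_per_tick marks the event throttled (hwc->interrupts ==
 * MAX_INTERRUPTS), logs a PERF_RECORD_THROTTLE and leaves it to the tick
 * path to unthrottle later; for freq-based events the sampling period is
 * also re-adjusted here.
 */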
8965 static int
8966 __perf_event_account_interrupt(struct perf_event *event, int throttle)
8967 {
8968         struct hw_perf_event *hwc = &event->hw;
8969         int ret = 0;
8970         u64 seq;
8971
8972         seq = __this_cpu_read(perf_throttled_seq);
8973         if (seq != hwc->interrupts_seq) {
8974                 hwc->interrupts_seq = seq;
8975                 hwc->interrupts = 1;
8976         } else {
8977                 hwc->interrupts++;
8978                 if (unlikely(throttle
8979                              && hwc->interrupts >= max_samples_per_tick)) {
8980                         __this_cpu_inc(perf_throttled_count);
8981                         tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
8982                         hwc->interrupts = MAX_INTERRUPTS;
8983                         perf_log_throttle(event, 0);
8984                         ret = 1;
8985                 }
8986         }
8987
8988         if (event->attr.freq) {
8989                 u64 now = perf_clock();
8990                 s64 delta = now - hwc->freq_time_stamp;
8991
8992                 hwc->freq_time_stamp = now;
8993
8994                 if (delta > 0 && delta < 2*TICK_NSEC)
8995                         perf_adjust_period(event, delta, hwc->last_period, true);
8996         }
8997
8998         return ret;
8999 }
9000
9001 int perf_event_account_interrupt(struct perf_event *event)
9002 {
9003         return __perf_event_account_interrupt(event, 1);
9004 }
9005
9006 /*
9007  * Generic event overflow handling, sampling.
9008  */
9009
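/*
 * Editorial note: event->event_limit is armed via the PERF_EVENT_IOC_REFRESH
 * ioctl; once the remaining count drops to zero the event is disabled from
 * NMI-safe context and its owner is woken with POLL_HUP, which is how
 * "refresh"-style self-profiling loops re-arm themselves.
 */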
9010 static int __perf_event_overflow(struct perf_event *event,
9011                                    int throttle, struct perf_sample_data *data,
9012                                    struct pt_regs *regs)
9013 {
9014         int events = atomic_read(&event->event_limit);
9015         int ret = 0;
9016
9017         /*
9018          * Non-sampling counters might still use the PMI to fold short
9019          * hardware counters, ignore those.
9020          */
9021         if (unlikely(!is_sampling_event(event)))
9022                 return 0;
9023
9024         ret = __perf_event_account_interrupt(event, throttle);
9025
9026         /*
9027          * XXX event_limit might not quite work as expected on inherited
9028          * events
9029          */
9030
9031         event->pending_kill = POLL_IN;
9032         if (events && atomic_dec_and_test(&event->event_limit)) {
9033                 ret = 1;
9034                 event->pending_kill = POLL_HUP;
9035
9036                 perf_event_disable_inatomic(event);
9037         }
9038
9039         READ_ONCE(event->overflow_handler)(event, data, regs);
9040
9041         if (*perf_event_fasync(event) && event->pending_kill) {
9042                 event->pending_wakeup = 1;
9043                 irq_work_queue(&event->pending);
9044         }
9045
9046         return ret;
9047 }
9048
9049 int perf_event_overflow(struct perf_event *event,
9050                           struct perf_sample_data *data,
9051                           struct pt_regs *regs)
9052 {
9053         return __perf_event_overflow(event, 1, data, regs);
9054 }
9055
9056 /*
9057  * Generic software event infrastructure
9058  */
9059
9060 struct swevent_htable {
9061         struct swevent_hlist            *swevent_hlist;
9062         struct mutex                    hlist_mutex;
9063         int                             hlist_refcount;
9064
9065         /* Recursion avoidance in each context */
9066         int                             recursion[PERF_NR_CONTEXTS];
9067 };
9068
9069 static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
9070
9071 /*
9072  * We directly increment event->count and keep a second value in
9073  * event->hw.period_left to count intervals. This period counter
9074  * is kept in the range [-sample_period, 0] so that we can use the
9075  * sign as trigger.
9076  */
9077
9078 u64 perf_swevent_set_period(struct perf_event *event)
9079 {
9080         struct hw_perf_event *hwc = &event->hw;
9081         u64 period = hwc->last_period;
9082         u64 nr, offset;
9083         s64 old, val;
9084
9085         hwc->last_period = hwc->sample_period;
9086
9087 again:
9088         old = val = local64_read(&hwc->period_left);
9089         if (val < 0)
9090                 return 0;
9091
9092         nr = div64_u64(period + val, period);
9093         offset = nr * period;
9094         val -= offset;
9095         if (local64_cmpxchg(&hwc->period_left, old, val) != old)
9096                 goto again;
9097
9098         return nr;
9099 }
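
/*
 * Editorial sketch (kept inert, not built as part of this file): a
 * stand-alone userspace model of the accounting above, assuming a fixed
 * sample period and no concurrency, showing how period_left stays in
 * [-sample_period, 0] and how the number of elapsed periods is recovered
 * once it reaches zero.  All "toy_" names are invented for the illustration.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

static uint64_t toy_set_period(int64_t *period_left, uint64_t period)
{
	int64_t val = *period_left;
	uint64_t nr;

	if (val < 0)			/* period not yet elapsed */
		return 0;

	nr = (period + val) / period;	/* whole periods that have elapsed */
	*period_left = val - nr * period;	/* back into [-period, 0] */
	return nr;
}

int main(void)
{
	int64_t period_left = -4;	/* sample_period == 4 */
	uint64_t i, nr;

	for (i = 1; i <= 10; i++) {
		period_left += 1;	/* one software event */
		if (period_left < 0)
			continue;
		nr = toy_set_period(&period_left, 4);
		printf("event %llu triggers %llu overflow(s)\n",
		       (unsigned long long)i, (unsigned long long)nr);
	}
	return 0;
}
#endif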
9100
9101 static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
9102                                     struct perf_sample_data *data,
9103                                     struct pt_regs *regs)
9104 {
9105         struct hw_perf_event *hwc = &event->hw;
9106         int throttle = 0;
9107
9108         if (!overflow)
9109                 overflow = perf_swevent_set_period(event);
9110
9111         if (hwc->interrupts == MAX_INTERRUPTS)
9112                 return;
9113
9114         for (; overflow; overflow--) {
9115                 if (__perf_event_overflow(event, throttle,
9116                                             data, regs)) {
9117                         /*
9118                          * We inhibit the overflow from happening when
9119                          * hwc->interrupts == MAX_INTERRUPTS.
9120                          */
9121                         break;
9122                 }
9123                 throttle = 1;
9124         }
9125 }
9126
9127 static void perf_swevent_event(struct perf_event *event, u64 nr,
9128                                struct perf_sample_data *data,
9129                                struct pt_regs *regs)
9130 {
9131         struct hw_perf_event *hwc = &event->hw;
9132
9133         local64_add(nr, &event->count);
9134
9135         if (!regs)
9136                 return;
9137
9138         if (!is_sampling_event(event))
9139                 return;
9140
9141         if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
9142                 data->period = nr;
9143                 return perf_swevent_overflow(event, 1, data, regs);
9144         } else
9145                 data->period = event->hw.last_period;
9146
9147         if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
9148                 return perf_swevent_overflow(event, 1, data, regs);
9149
9150         if (local64_add_negative(nr, &hwc->period_left))
9151                 return;
9152
9153         perf_swevent_overflow(event, 0, data, regs);
9154 }
9155
9156 static int perf_exclude_event(struct perf_event *event,
9157                               struct pt_regs *regs)
9158 {
9159         if (event->hw.state & PERF_HES_STOPPED)
9160                 return 1;
9161
9162         if (regs) {
9163                 if (event->attr.exclude_user && user_mode(regs))
9164                         return 1;
9165
9166                 if (event->attr.exclude_kernel && !user_mode(regs))
9167                         return 1;
9168         }
9169
9170         return 0;
9171 }
9172
9173 static int perf_swevent_match(struct perf_event *event,
9174                                 enum perf_type_id type,
9175                                 u32 event_id,
9176                                 struct perf_sample_data *data,
9177                                 struct pt_regs *regs)
9178 {
9179         if (event->attr.type != type)
9180                 return 0;
9181
9182         if (event->attr.config != event_id)
9183                 return 0;
9184
9185         if (perf_exclude_event(event, regs))
9186                 return 0;
9187
9188         return 1;
9189 }
9190
9191 static inline u64 swevent_hash(u64 type, u32 event_id)
9192 {
9193         u64 val = event_id | (type << 32);
9194
9195         return hash_64(val, SWEVENT_HLIST_BITS);
9196 }
9197
9198 static inline struct hlist_head *
9199 __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
9200 {
9201         u64 hash = swevent_hash(type, event_id);
9202
9203         return &hlist->heads[hash];
9204 }
9205
9206 /* For the read side: events when they trigger */
9207 static inline struct hlist_head *
9208 find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
9209 {
9210         struct swevent_hlist *hlist;
9211
9212         hlist = rcu_dereference(swhash->swevent_hlist);
9213         if (!hlist)
9214                 return NULL;
9215
9216         return __find_swevent_head(hlist, type, event_id);
9217 }
9218
9219 /* For the event head insertion and removal in the hlist */
9220 static inline struct hlist_head *
9221 find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
9222 {
9223         struct swevent_hlist *hlist;
9224         u32 event_id = event->attr.config;
9225         u64 type = event->attr.type;
9226
9227         /*
9228          * Event scheduling is always serialized against hlist allocation
9229          * and release, which makes the protected version suitable here.
9230          * The context lock guarantees that.
9231          */
9232         hlist = rcu_dereference_protected(swhash->swevent_hlist,
9233                                           lockdep_is_held(&event->ctx->lock));
9234         if (!hlist)
9235                 return NULL;
9236
9237         return __find_swevent_head(hlist, type, event_id);
9238 }
9239
9240 static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
9241                                     u64 nr,
9242                                     struct perf_sample_data *data,
9243                                     struct pt_regs *regs)
9244 {
9245         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
9246         struct perf_event *event;
9247         struct hlist_head *head;
9248
9249         rcu_read_lock();
9250         head = find_swevent_head_rcu(swhash, type, event_id);
9251         if (!head)
9252                 goto end;
9253
9254         hlist_for_each_entry_rcu(event, head, hlist_entry) {
9255                 if (perf_swevent_match(event, type, event_id, data, regs))
9256                         perf_swevent_event(event, nr, data, regs);
9257         }
9258 end:
9259         rcu_read_unlock();
9260 }
9261
9262 DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
9263
9264 int perf_swevent_get_recursion_context(void)
9265 {
9266         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
9267
9268         return get_recursion_context(swhash->recursion);
9269 }
9270 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
9271
9272 void perf_swevent_put_recursion_context(int rctx)
9273 {
9274         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
9275
9276         put_recursion_context(swhash->recursion, rctx);
9277 }
9278
9279 void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
9280 {
9281         struct perf_sample_data data;
9282
9283         if (WARN_ON_ONCE(!regs))
9284                 return;
9285
9286         perf_sample_data_init(&data, addr, 0);
9287         do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
9288 }
9289
9290 void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
9291 {
9292         int rctx;
9293
9294         preempt_disable_notrace();
9295         rctx = perf_swevent_get_recursion_context();
9296         if (unlikely(rctx < 0))
9297                 goto fail;
9298
9299         ___perf_sw_event(event_id, nr, regs, addr);
9300
9301         perf_swevent_put_recursion_context(rctx);
9302 fail:
9303         preempt_enable_notrace();
9304 }
9305
9306 static void perf_swevent_read(struct perf_event *event)
9307 {
9308 }
9309
9310 static int perf_swevent_add(struct perf_event *event, int flags)
9311 {
9312         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
9313         struct hw_perf_event *hwc = &event->hw;
9314         struct hlist_head *head;
9315
9316         if (is_sampling_event(event)) {
9317                 hwc->last_period = hwc->sample_period;
9318                 perf_swevent_set_period(event);
9319         }
9320
9321         hwc->state = !(flags & PERF_EF_START);
9322
9323         head = find_swevent_head(swhash, event);
9324         if (WARN_ON_ONCE(!head))
9325                 return -EINVAL;
9326
9327         hlist_add_head_rcu(&event->hlist_entry, head);
9328         perf_event_update_userpage(event);
9329
9330         return 0;
9331 }
9332
9333 static void perf_swevent_del(struct perf_event *event, int flags)
9334 {
9335         hlist_del_rcu(&event->hlist_entry);
9336 }
9337
9338 static void perf_swevent_start(struct perf_event *event, int flags)
9339 {
9340         event->hw.state = 0;
9341 }
9342
9343 static void perf_swevent_stop(struct perf_event *event, int flags)
9344 {
9345         event->hw.state = PERF_HES_STOPPED;
9346 }
9347
9348 /* Deref the hlist from the update side */
9349 static inline struct swevent_hlist *
9350 swevent_hlist_deref(struct swevent_htable *swhash)
9351 {
9352         return rcu_dereference_protected(swhash->swevent_hlist,
9353                                          lockdep_is_held(&swhash->hlist_mutex));
9354 }
9355
9356 static void swevent_hlist_release(struct swevent_htable *swhash)
9357 {
9358         struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
9359
9360         if (!hlist)
9361                 return;
9362
9363         RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
9364         kfree_rcu(hlist, rcu_head);
9365 }
9366
9367 static void swevent_hlist_put_cpu(int cpu)
9368 {
9369         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
9370
9371         mutex_lock(&swhash->hlist_mutex);
9372
9373         if (!--swhash->hlist_refcount)
9374                 swevent_hlist_release(swhash);
9375
9376         mutex_unlock(&swhash->hlist_mutex);
9377 }
9378
9379 static void swevent_hlist_put(void)
9380 {
9381         int cpu;
9382
9383         for_each_possible_cpu(cpu)
9384                 swevent_hlist_put_cpu(cpu);
9385 }
9386
9387 static int swevent_hlist_get_cpu(int cpu)
9388 {
9389         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
9390         int err = 0;
9391
9392         mutex_lock(&swhash->hlist_mutex);
9393         if (!swevent_hlist_deref(swhash) &&
9394             cpumask_test_cpu(cpu, perf_online_mask)) {
9395                 struct swevent_hlist *hlist;
9396
9397                 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
9398                 if (!hlist) {
9399                         err = -ENOMEM;
9400                         goto exit;
9401                 }
9402                 rcu_assign_pointer(swhash->swevent_hlist, hlist);
9403         }
9404         swhash->hlist_refcount++;
9405 exit:
9406         mutex_unlock(&swhash->hlist_mutex);
9407
9408         return err;
9409 }
9410
9411 static int swevent_hlist_get(void)
9412 {
9413         int err, cpu, failed_cpu;
9414
9415         mutex_lock(&pmus_lock);
9416         for_each_possible_cpu(cpu) {
9417                 err = swevent_hlist_get_cpu(cpu);
9418                 if (err) {
9419                         failed_cpu = cpu;
9420                         goto fail;
9421                 }
9422         }
9423         mutex_unlock(&pmus_lock);
9424         return 0;
9425 fail:
9426         for_each_possible_cpu(cpu) {
9427                 if (cpu == failed_cpu)
9428                         break;
9429                 swevent_hlist_put_cpu(cpu);
9430         }
9431         mutex_unlock(&pmus_lock);
9432         return err;
9433 }
9434
9435 struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
9436
9437 static void sw_perf_event_destroy(struct perf_event *event)
9438 {
9439         u64 event_id = event->attr.config;
9440
9441         WARN_ON(event->parent);
9442
9443         static_key_slow_dec(&perf_swevent_enabled[event_id]);
9444         swevent_hlist_put();
9445 }
9446
9447 static int perf_swevent_init(struct perf_event *event)
9448 {
9449         u64 event_id = event->attr.config;
9450
9451         if (event->attr.type != PERF_TYPE_SOFTWARE)
9452                 return -ENOENT;
9453
9454         /*
9455          * no branch sampling for software events
9456          */
9457         if (has_branch_stack(event))
9458                 return -EOPNOTSUPP;
9459
9460         switch (event_id) {
9461         case PERF_COUNT_SW_CPU_CLOCK:
9462         case PERF_COUNT_SW_TASK_CLOCK:
9463                 return -ENOENT;
9464
9465         default:
9466                 break;
9467         }
9468
9469         if (event_id >= PERF_COUNT_SW_MAX)
9470                 return -ENOENT;
9471
9472         if (!event->parent) {
9473                 int err;
9474
9475                 err = swevent_hlist_get();
9476                 if (err)
9477                         return err;
9478
9479                 static_key_slow_inc(&perf_swevent_enabled[event_id]);
9480                 event->destroy = sw_perf_event_destroy;
9481         }
9482
9483         return 0;
9484 }
9485
9486 static struct pmu perf_swevent = {
9487         .task_ctx_nr    = perf_sw_context,
9488
9489         .capabilities   = PERF_PMU_CAP_NO_NMI,
9490
9491         .event_init     = perf_swevent_init,
9492         .add            = perf_swevent_add,
9493         .del            = perf_swevent_del,
9494         .start          = perf_swevent_start,
9495         .stop           = perf_swevent_stop,
9496         .read           = perf_swevent_read,
9497 };
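
/*
 * Illustrative sketch, not part of this file: a minimal userspace consumer
 * of the software PMU above. It opens PERF_COUNT_SW_PAGE_FAULTS for the
 * calling thread through the perf_event_open() syscall and reads back the
 * count. The helper name sw_count_page_faults() is made up and error
 * handling is elided.
 */
#if 0	/* userspace example, not kernel code */
#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>

static long sw_count_page_faults(void)
{
	struct perf_event_attr attr;
	uint64_t count = 0;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_SOFTWARE;		/* routed to perf_swevent_init() */
	attr.size = sizeof(attr);
	attr.config = PERF_COUNT_SW_PAGE_FAULTS;
	attr.disabled = 1;
	attr.exclude_kernel = 1;

	fd = syscall(__NR_perf_event_open, &attr, 0 /* self */, -1 /* any cpu */,
		     -1 /* no group */, 0);
	if (fd < 0)
		return -1;

	ioctl(fd, PERF_EVENT_IOC_RESET, 0);
	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	/* ... run the workload of interest ... */
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
	read(fd, &count, sizeof(count));
	close(fd);

	return (long)count;
}
#endif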
9498
9499 #ifdef CONFIG_EVENT_TRACING
9500
9501 static int perf_tp_filter_match(struct perf_event *event,
9502                                 struct perf_sample_data *data)
9503 {
9504         void *record = data->raw->frag.data;
9505
9506         /* only top level events have filters set */
9507         if (event->parent)
9508                 event = event->parent;
9509
9510         if (likely(!event->filter) || filter_match_preds(event->filter, record))
9511                 return 1;
9512         return 0;
9513 }
9514
9515 static int perf_tp_event_match(struct perf_event *event,
9516                                 struct perf_sample_data *data,
9517                                 struct pt_regs *regs)
9518 {
9519         if (event->hw.state & PERF_HES_STOPPED)
9520                 return 0;
9521         /*
9522          * If exclude_kernel, only trace user-space tracepoints (uprobes)
9523          */
9524         if (event->attr.exclude_kernel && !user_mode(regs))
9525                 return 0;
9526
9527         if (!perf_tp_filter_match(event, data))
9528                 return 0;
9529
9530         return 1;
9531 }
9532
9533 void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
9534                                struct trace_event_call *call, u64 count,
9535                                struct pt_regs *regs, struct hlist_head *head,
9536                                struct task_struct *task)
9537 {
9538         if (bpf_prog_array_valid(call)) {
9539                 *(struct pt_regs **)raw_data = regs;
9540                 if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) {
9541                         perf_swevent_put_recursion_context(rctx);
9542                         return;
9543                 }
9544         }
9545         perf_tp_event(call->event.type, count, raw_data, size, regs, head,
9546                       rctx, task);
9547 }
9548 EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
9549
9550 void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
9551                    struct pt_regs *regs, struct hlist_head *head, int rctx,
9552                    struct task_struct *task)
9553 {
9554         struct perf_sample_data data;
9555         struct perf_event *event;
9556
9557         struct perf_raw_record raw = {
9558                 .frag = {
9559                         .size = entry_size,
9560                         .data = record,
9561                 },
9562         };
9563
9564         perf_sample_data_init(&data, 0, 0);
9565         data.raw = &raw;
9566
9567         perf_trace_buf_update(record, event_type);
9568
9569         hlist_for_each_entry_rcu(event, head, hlist_entry) {
9570                 if (perf_tp_event_match(event, &data, regs))
9571                         perf_swevent_event(event, count, &data, regs);
9572         }
9573
9574         /*
9575          * If we were given a target task, also iterate its context and
9576          * deliver this event there too.
9577          */
9578         if (task && task != current) {
9579                 struct perf_event_context *ctx;
9580                 struct trace_entry *entry = record;
9581
9582                 rcu_read_lock();
9583                 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
9584                 if (!ctx)
9585                         goto unlock;
9586
9587                 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
9588                         if (event->cpu != smp_processor_id())
9589                                 continue;
9590                         if (event->attr.type != PERF_TYPE_TRACEPOINT)
9591                                 continue;
9592                         if (event->attr.config != entry->type)
9593                                 continue;
9594                         if (perf_tp_event_match(event, &data, regs))
9595                                 perf_swevent_event(event, count, &data, regs);
9596                 }
9597 unlock:
9598                 rcu_read_unlock();
9599         }
9600
9601         perf_swevent_put_recursion_context(rctx);
9602 }
9603 EXPORT_SYMBOL_GPL(perf_tp_event);
9604
9605 static void tp_perf_event_destroy(struct perf_event *event)
9606 {
9607         perf_trace_destroy(event);
9608 }
9609
9610 static int perf_tp_event_init(struct perf_event *event)
9611 {
9612         int err;
9613
9614         if (event->attr.type != PERF_TYPE_TRACEPOINT)
9615                 return -ENOENT;
9616
9617         /*
9618          * no branch sampling for tracepoint events
9619          */
9620         if (has_branch_stack(event))
9621                 return -EOPNOTSUPP;
9622
9623         err = perf_trace_init(event);
9624         if (err)
9625                 return err;
9626
9627         event->destroy = tp_perf_event_destroy;
9628
9629         return 0;
9630 }
9631
9632 static struct pmu perf_tracepoint = {
9633         .task_ctx_nr    = perf_sw_context,
9634
9635         .event_init     = perf_tp_event_init,
9636         .add            = perf_trace_add,
9637         .del            = perf_trace_del,
9638         .start          = perf_swevent_start,
9639         .stop           = perf_swevent_stop,
9640         .read           = perf_swevent_read,
9641 };
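
/*
 * Illustrative sketch, not part of this file: attaching to a kernel
 * tracepoint from userspace through the PMU above. attr.config carries the
 * tracepoint id, typically read from tracefs (for example
 * /sys/kernel/tracing/events/sched/sched_switch/id; the mount point may
 * differ). Helper name and error handling are simplified assumptions.
 */
#if 0	/* userspace example, not kernel code */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>

static int open_tracepoint_event(uint64_t tracepoint_id, pid_t pid)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_TRACEPOINT;	/* routed to perf_tp_event_init() */
	attr.size = sizeof(attr);
	attr.config = tracepoint_id;		/* id file from tracefs */
	attr.sample_period = 1;
	attr.sample_type = PERF_SAMPLE_RAW;

	return syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0);
}
#endif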
9642
9643 #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
9644 /*
9645  * Flags in config, used by the dynamic kprobe and uprobe PMUs.
9646  * The flags should match the following PMU_FORMAT_ATTR()s.
9647  *
9648  * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
9649  *                               if not set, create kprobe/uprobe
9650  *
9651  * The following values specify a reference counter (or semaphore in the
9652  * terminology of tools like dtrace, systemtap, etc.) used by Userspace
9653  * Statically Defined Tracepoints (USDT). 32 bits of config hold the offset.
9654  *
9655  * PERF_UPROBE_REF_CTR_OFFSET_BITS      # of bits in config used as the offset
9656  * PERF_UPROBE_REF_CTR_OFFSET_SHIFT     # of bits to shift left
9657  */
9658 enum perf_probe_config {
9659         PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0,  /* [k,u]retprobe */
9660         PERF_UPROBE_REF_CTR_OFFSET_BITS = 32,
9661         PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS,
9662 };
9663
9664 PMU_FORMAT_ATTR(retprobe, "config:0");
9665 #endif
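
/*
 * Illustrative sketch, not part of this file: how userspace would pack
 * attr.config for the dynamic kprobe/uprobe PMUs according to the layout
 * above (retprobe flag in bit 0, uprobe reference counter offset in
 * config:32-63). The helper name is made up for the example.
 */
#if 0	/* userspace example, not kernel code */
#include <stdint.h>
#include <stdbool.h>

static uint64_t pack_probe_config(bool is_retprobe, uint32_t ref_ctr_offset)
{
	uint64_t config = 0;

	if (is_retprobe)
		config |= 1ULL << 0;			/* PERF_PROBE_CONFIG_IS_RETPROBE */
	config |= (uint64_t)ref_ctr_offset << 32;	/* PERF_UPROBE_REF_CTR_OFFSET_SHIFT */

	return config;
}
#endif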
9666
9667 #ifdef CONFIG_KPROBE_EVENTS
9668 static struct attribute *kprobe_attrs[] = {
9669         &format_attr_retprobe.attr,
9670         NULL,
9671 };
9672
9673 static struct attribute_group kprobe_format_group = {
9674         .name = "format",
9675         .attrs = kprobe_attrs,
9676 };
9677
9678 static const struct attribute_group *kprobe_attr_groups[] = {
9679         &kprobe_format_group,
9680         NULL,
9681 };
9682
9683 static int perf_kprobe_event_init(struct perf_event *event);
9684 static struct pmu perf_kprobe = {
9685         .task_ctx_nr    = perf_sw_context,
9686         .event_init     = perf_kprobe_event_init,
9687         .add            = perf_trace_add,
9688         .del            = perf_trace_del,
9689         .start          = perf_swevent_start,
9690         .stop           = perf_swevent_stop,
9691         .read           = perf_swevent_read,
9692         .attr_groups    = kprobe_attr_groups,
9693 };
9694
9695 static int perf_kprobe_event_init(struct perf_event *event)
9696 {
9697         int err;
9698         bool is_retprobe;
9699
9700         if (event->attr.type != perf_kprobe.type)
9701                 return -ENOENT;
9702
9703         if (!perfmon_capable())
9704                 return -EACCES;
9705
9706         /*
9707          * no branch sampling for probe events
9708          */
9709         if (has_branch_stack(event))
9710                 return -EOPNOTSUPP;
9711
9712         is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
9713         err = perf_kprobe_init(event, is_retprobe);
9714         if (err)
9715                 return err;
9716
9717         event->destroy = perf_kprobe_destroy;
9718
9719         return 0;
9720 }
9721 #endif /* CONFIG_KPROBE_EVENTS */
9722
9723 #ifdef CONFIG_UPROBE_EVENTS
9724 PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63");
9725
9726 static struct attribute *uprobe_attrs[] = {
9727         &format_attr_retprobe.attr,
9728         &format_attr_ref_ctr_offset.attr,
9729         NULL,
9730 };
9731
9732 static struct attribute_group uprobe_format_group = {
9733         .name = "format",
9734         .attrs = uprobe_attrs,
9735 };
9736
9737 static const struct attribute_group *uprobe_attr_groups[] = {
9738         &uprobe_format_group,
9739         NULL,
9740 };
9741
9742 static int perf_uprobe_event_init(struct perf_event *event);
9743 static struct pmu perf_uprobe = {
9744         .task_ctx_nr    = perf_sw_context,
9745         .event_init     = perf_uprobe_event_init,
9746         .add            = perf_trace_add,
9747         .del            = perf_trace_del,
9748         .start          = perf_swevent_start,
9749         .stop           = perf_swevent_stop,
9750         .read           = perf_swevent_read,
9751         .attr_groups    = uprobe_attr_groups,
9752 };
9753
9754 static int perf_uprobe_event_init(struct perf_event *event)
9755 {
9756         int err;
9757         unsigned long ref_ctr_offset;
9758         bool is_retprobe;
9759
9760         if (event->attr.type != perf_uprobe.type)
9761                 return -ENOENT;
9762
9763         if (!perfmon_capable())
9764                 return -EACCES;
9765
9766         /*
9767          * no branch sampling for probe events
9768          */
9769         if (has_branch_stack(event))
9770                 return -EOPNOTSUPP;
9771
9772         is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
9773         ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT;
9774         err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe);
9775         if (err)
9776                 return err;
9777
9778         event->destroy = perf_uprobe_destroy;
9779
9780         return 0;
9781 }
9782 #endif /* CONFIG_UPROBE_EVENTS */
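
/*
 * Illustrative sketch, not part of this file: opening a uretprobe through
 * the dynamic "uprobe" PMU. The PMU's dynamic type is read from sysfs, the
 * probed binary and offset go in attr.uprobe_path/attr.probe_offset (per
 * the uapi perf_event_attr layout), and the reference counter offset is
 * packed into config:32-63 as described above. Paths, helper names and
 * error handling are simplified assumptions.
 */
#if 0	/* userspace example, not kernel code */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>
#include <stdio.h>

static int open_uretprobe(const char *binary, uint64_t func_offset,
			  uint32_t ref_ctr_offset, pid_t pid)
{
	struct perf_event_attr attr;
	unsigned int type;
	FILE *f;

	f = fopen("/sys/bus/event_source/devices/uprobe/type", "r");
	if (!f)
		return -1;
	if (fscanf(f, "%u", &type) != 1) {
		fclose(f);
		return -1;
	}
	fclose(f);

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = type;				/* handled by perf_uprobe_event_init() */
	attr.config = (1ULL << 0) |			/* PERF_PROBE_CONFIG_IS_RETPROBE */
		      ((uint64_t)ref_ctr_offset << 32);	/* config:32-63 */
	attr.uprobe_path = (uint64_t)(uintptr_t)binary;
	attr.probe_offset = func_offset;

	return syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0);
}
#endif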
9783
9784 static inline void perf_tp_register(void)
9785 {
9786         perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
9787 #ifdef CONFIG_KPROBE_EVENTS
9788         perf_pmu_register(&perf_kprobe, "kprobe", -1);
9789 #endif
9790 #ifdef CONFIG_UPROBE_EVENTS
9791         perf_pmu_register(&perf_uprobe, "uprobe", -1);
9792 #endif
9793 }
9794
9795 static void perf_event_free_filter(struct perf_event *event)
9796 {
9797         ftrace_profile_free_filter(event);
9798 }
9799
9800 #ifdef CONFIG_BPF_SYSCALL
9801 static void bpf_overflow_handler(struct perf_event *event,
9802                                  struct perf_sample_data *data,
9803                                  struct pt_regs *regs)
9804 {
9805         struct bpf_perf_event_data_kern ctx = {
9806                 .data = data,
9807                 .event = event,
9808         };
9809         int ret = 0;
9810
9811         ctx.regs = perf_arch_bpf_user_pt_regs(regs);
9812         if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
9813                 goto out;
9814         rcu_read_lock();
9815         ret = BPF_PROG_RUN(event->prog, &ctx);
9816         rcu_read_unlock();
9817 out:
9818         __this_cpu_dec(bpf_prog_active);
9819         if (!ret)
9820                 return;
9821
9822         event->orig_overflow_handler(event, data, regs);
9823 }
9824
9825 static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
9826 {
9827         struct bpf_prog *prog;
9828
9829         if (event->overflow_handler_context)
9830                 /* hw breakpoint or kernel counter */
9831                 return -EINVAL;
9832
9833         if (event->prog)
9834                 return -EEXIST;
9835
9836         prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
9837         if (IS_ERR(prog))
9838                 return PTR_ERR(prog);
9839
9840         if (event->attr.precise_ip &&
9841             prog->call_get_stack &&
9842             (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) ||
9843              event->attr.exclude_callchain_kernel ||
9844              event->attr.exclude_callchain_user)) {
9845                 /*
9846                  * On perf_event with precise_ip, calling bpf_get_stack()
9847                  * may trigger unwinder warnings and occasional crashes.
9848                  * bpf_get_[stack|stackid] works around this issue by using
9849                  * callchain attached to perf_sample_data. If the
9850                  * perf_event does not have a full (kernel and user) callchain
9851                  * attached to perf_sample_data, do not allow attaching a BPF
9852                  * program that calls bpf_get_[stack|stackid].
9853                  */
9854                 bpf_prog_put(prog);
9855                 return -EPROTO;
9856         }
9857
9858         event->prog = prog;
9859         event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
9860         WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
9861         return 0;
9862 }
9863
9864 static void perf_event_free_bpf_handler(struct perf_event *event)
9865 {
9866         struct bpf_prog *prog = event->prog;
9867
9868         if (!prog)
9869                 return;
9870
9871         WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
9872         event->prog = NULL;
9873         bpf_prog_put(prog);
9874 }
9875 #else
9876 static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
9877 {
9878         return -EOPNOTSUPP;
9879 }
9880 static void perf_event_free_bpf_handler(struct perf_event *event)
9881 {
9882 }
9883 #endif
9884
9885 /*
9886  * returns true if the event is a tracepoint, or a kprobe/uprobe created
9887  * with perf_event_open()
9888  */
9889 static inline bool perf_event_is_tracing(struct perf_event *event)
9890 {
9891         if (event->pmu == &perf_tracepoint)
9892                 return true;
9893 #ifdef CONFIG_KPROBE_EVENTS
9894         if (event->pmu == &perf_kprobe)
9895                 return true;
9896 #endif
9897 #ifdef CONFIG_UPROBE_EVENTS
9898         if (event->pmu == &perf_uprobe)
9899                 return true;
9900 #endif
9901         return false;
9902 }
9903
9904 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
9905 {
9906         bool is_kprobe, is_tracepoint, is_syscall_tp;
9907         struct bpf_prog *prog;
9908         int ret;
9909
9910         if (!perf_event_is_tracing(event))
9911                 return perf_event_set_bpf_handler(event, prog_fd);
9912
9913         is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
9914         is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
9915         is_syscall_tp = is_syscall_trace_event(event->tp_event);
9916         if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
9917                 /* bpf programs can only be attached to u/kprobe or tracepoint */
9918                 return -EINVAL;
9919
9920         prog = bpf_prog_get(prog_fd);
9921         if (IS_ERR(prog))
9922                 return PTR_ERR(prog);
9923
9924         if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
9925             (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
9926             (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
9927                 /* valid fd, but invalid bpf program type */
9928                 bpf_prog_put(prog);
9929                 return -EINVAL;
9930         }
9931
9932         /* Kprobe override only works for kprobes, not uprobes. */
9933         if (prog->kprobe_override &&
9934             !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) {
9935                 bpf_prog_put(prog);
9936                 return -EINVAL;
9937         }
9938
9939         if (is_tracepoint || is_syscall_tp) {
9940                 int off = trace_event_get_offsets(event->tp_event);
9941
9942                 if (prog->aux->max_ctx_offset > off) {
9943                         bpf_prog_put(prog);
9944                         return -EACCES;
9945                 }
9946         }
9947
9948         ret = perf_event_attach_bpf_prog(event, prog);
9949         if (ret)
9950                 bpf_prog_put(prog);
9951         return ret;
9952 }
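
/*
 * Illustrative sketch, not part of this file: how the attach path above is
 * reached from userspace. bpf_prog_fd is assumed to come from an earlier
 * bpf(BPF_PROG_LOAD) with a program type matching the event (for instance
 * BPF_PROG_TYPE_KPROBE for a kprobe event). Error handling is elided.
 */
#if 0	/* userspace example, not kernel code */
#include <linux/perf_event.h>
#include <sys/ioctl.h>

static int attach_bpf_to_perf_event(int perf_fd, int bpf_prog_fd)
{
	/* Ends up in perf_event_set_bpf_prog() via the perf ioctl path */
	if (ioctl(perf_fd, PERF_EVENT_IOC_SET_BPF, bpf_prog_fd))
		return -1;

	/* Start the event if it was opened disabled */
	return ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, 0);
}
#endif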
9953
9954 static void perf_event_free_bpf_prog(struct perf_event *event)
9955 {
9956         if (!perf_event_is_tracing(event)) {
9957                 perf_event_free_bpf_handler(event);
9958                 return;
9959         }
9960         perf_event_detach_bpf_prog(event);
9961 }
9962
9963 #else
9964
9965 static inline void perf_tp_register(void)
9966 {
9967 }
9968
9969 static void perf_event_free_filter(struct perf_event *event)
9970 {
9971 }
9972
9973 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
9974 {
9975         return -ENOENT;
9976 }
9977
9978 static void perf_event_free_bpf_prog(struct perf_event *event)
9979 {
9980 }
9981 #endif /* CONFIG_EVENT_TRACING */
9982
9983 #ifdef CONFIG_HAVE_HW_BREAKPOINT
9984 void perf_bp_event(struct perf_event *bp, void *data)
9985 {
9986         struct perf_sample_data sample;
9987         struct pt_regs *regs = data;
9988
9989         perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
9990
9991         if (!bp->hw.state && !perf_exclude_event(bp, regs))
9992                 perf_swevent_event(bp, 1, &sample, regs);
9993 }
9994 #endif
9995
9996 /*
9997  * Allocate a new address filter
9998  */
9999 static struct perf_addr_filter *
10000 perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
10001 {
10002         int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu);
10003         struct perf_addr_filter *filter;
10004
10005         filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
10006         if (!filter)
10007                 return NULL;
10008
10009         INIT_LIST_HEAD(&filter->entry);
10010         list_add_tail(&filter->entry, filters);
10011
10012         return filter;
10013 }
10014
10015 static void free_filters_list(struct list_head *filters)
10016 {
10017         struct perf_addr_filter *filter, *iter;
10018
10019         list_for_each_entry_safe(filter, iter, filters, entry) {
10020                 path_put(&filter->path);
10021                 list_del(&filter->entry);
10022                 kfree(filter);
10023         }
10024 }
10025
10026 /*
10027  * Free existing address filters and optionally install new ones
10028  */
10029 static void perf_addr_filters_splice(struct perf_event *event,
10030                                      struct list_head *head)
10031 {
10032         unsigned long flags;
10033         LIST_HEAD(list);
10034
10035         if (!has_addr_filter(event))
10036                 return;
10037
10038         /* don't bother with children, they don't have their own filters */
10039         if (event->parent)
10040                 return;
10041
10042         raw_spin_lock_irqsave(&event->addr_filters.lock, flags);
10043
10044         list_splice_init(&event->addr_filters.list, &list);
10045         if (head)
10046                 list_splice(head, &event->addr_filters.list);
10047
10048         raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);
10049
10050         free_filters_list(&list);
10051 }
10052
10053 /*
10054  * Scan through mm's vmas and see if one of them matches the
10055  * @filter; if so, adjust filter's address range.
10056  * Called with mm::mmap_lock down for reading.
10057  */
10058 static void perf_addr_filter_apply(struct perf_addr_filter *filter,
10059                                    struct mm_struct *mm,
10060                                    struct perf_addr_filter_range *fr)
10061 {
10062         struct vm_area_struct *vma;
10063
10064         for (vma = mm->mmap; vma; vma = vma->vm_next) {
10065                 if (!vma->vm_file)
10066                         continue;
10067
10068                 if (perf_addr_filter_vma_adjust(filter, vma, fr))
10069                         return;
10070         }
10071 }
10072
10073 /*
10074  * Update event's address range filters based on the
10075  * task's existing mappings, if any.
10076  */
10077 static void perf_event_addr_filters_apply(struct perf_event *event)
10078 {
10079         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
10080         struct task_struct *task = READ_ONCE(event->ctx->task);
10081         struct perf_addr_filter *filter;
10082         struct mm_struct *mm = NULL;
10083         unsigned int count = 0;
10084         unsigned long flags;
10085
10086         /*
10087          * We may observe TASK_TOMBSTONE, which means that the event tear-down
10088          * will stop on the parent's child_mutex that our caller is also holding
10089          */
10090         if (task == TASK_TOMBSTONE)
10091                 return;
10092
10093         if (ifh->nr_file_filters) {
10094                 mm = get_task_mm(event->ctx->task);
10095                 if (!mm)
10096                         goto restart;
10097
10098                 mmap_read_lock(mm);
10099         }
10100
10101         raw_spin_lock_irqsave(&ifh->lock, flags);
10102         list_for_each_entry(filter, &ifh->list, entry) {
10103                 if (filter->path.dentry) {
10104                         /*
10105                          * Adjust base offset if the filter is associated with a
10106                          * binary that needs to be mapped:
10107                          */
10108                         event->addr_filter_ranges[count].start = 0;
10109                         event->addr_filter_ranges[count].size = 0;
10110
10111                         perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]);
10112                 } else {
10113                         event->addr_filter_ranges[count].start = filter->offset;
10114                         event->addr_filter_ranges[count].size  = filter->size;
10115                 }
10116
10117                 count++;
10118         }
10119
10120         event->addr_filters_gen++;
10121         raw_spin_unlock_irqrestore(&ifh->lock, flags);
10122
10123         if (ifh->nr_file_filters) {
10124                 mmap_read_unlock(mm);
10125
10126                 mmput(mm);
10127         }
10128
10129 restart:
10130         perf_event_stop(event, 1);
10131 }
10132
10133 /*
10134  * Address range filtering: limiting the data to certain
10135  * instruction address ranges. Filters are ioctl()ed to us from
10136  * userspace as ascii strings.
10137  *
10138  * Filter string format:
10139  *
10140  * ACTION RANGE_SPEC
10141  * where ACTION is one of the
10142  *  * "filter": limit the trace to this region
10143  *  * "start": start tracing from this address
10144  *  * "stop": stop tracing at this address/region;
10145  * RANGE_SPEC is
10146  *  * for kernel addresses: <start address>[/<size>]
10147  *  * for object files:     <start address>[/<size>]@</path/to/object/file>
10148  *
10149  * if <size> is not specified or is zero, the range is treated as a single
10150  * address; not valid for ACTION=="filter".
10151  */
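
/*
 * Illustrative sketch, not part of this file: installing such a filter
 * string from userspace on an event fd of an address-filter capable PMU
 * (for example Intel PT). The binary path and addresses below are made-up
 * values; error handling is elided.
 */
#if 0	/* userspace example, not kernel code */
#include <linux/perf_event.h>
#include <sys/ioctl.h>

static int set_addr_filter(int perf_fd)
{
	/* Trace only 0x1000 bytes starting at offset 0x4000 of the object */
	const char *filter = "filter 0x4000/0x1000@/usr/bin/myapp";

	/* Handled by perf_event_set_filter() -> perf_event_set_addr_filter() */
	return ioctl(perf_fd, PERF_EVENT_IOC_SET_FILTER, filter);
}
#endif
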
10152 enum {
10153         IF_ACT_NONE = -1,
10154         IF_ACT_FILTER,
10155         IF_ACT_START,
10156         IF_ACT_STOP,
10157         IF_SRC_FILE,
10158         IF_SRC_KERNEL,
10159         IF_SRC_FILEADDR,
10160         IF_SRC_KERNELADDR,
10161 };
10162
10163 enum {
10164         IF_STATE_ACTION = 0,
10165         IF_STATE_SOURCE,
10166         IF_STATE_END,
10167 };
10168
10169 static const match_table_t if_tokens = {
10170         { IF_ACT_FILTER,        "filter" },
10171         { IF_ACT_START,         "start" },
10172         { IF_ACT_STOP,          "stop" },
10173         { IF_SRC_FILE,          "%u/%u@%s" },
10174         { IF_SRC_KERNEL,        "%u/%u" },
10175         { IF_SRC_FILEADDR,      "%u@%s" },
10176         { IF_SRC_KERNELADDR,    "%u" },
10177         { IF_ACT_NONE,          NULL },
10178 };
10179
10180 /*
10181  * Address filter string parser
10182  */
10183 static int
10184 perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
10185                              struct list_head *filters)
10186 {
10187         struct perf_addr_filter *filter = NULL;
10188         char *start, *orig, *filename = NULL;
10189         substring_t args[MAX_OPT_ARGS];
10190         int state = IF_STATE_ACTION, token;
10191         unsigned int kernel = 0;
10192         int ret = -EINVAL;
10193
10194         orig = fstr = kstrdup(fstr, GFP_KERNEL);
10195         if (!fstr)
10196                 return -ENOMEM;
10197
10198         while ((start = strsep(&fstr, " ,\n")) != NULL) {
10199                 static const enum perf_addr_filter_action_t actions[] = {
10200                         [IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER,
10201                         [IF_ACT_START]  = PERF_ADDR_FILTER_ACTION_START,
10202                         [IF_ACT_STOP]   = PERF_ADDR_FILTER_ACTION_STOP,
10203                 };
10204                 ret = -EINVAL;
10205
10206                 if (!*start)
10207                         continue;
10208
10209                 /* filter definition begins */
10210                 if (state == IF_STATE_ACTION) {
10211                         filter = perf_addr_filter_new(event, filters);
10212                         if (!filter)
10213                                 goto fail;
10214                 }
10215
10216                 token = match_token(start, if_tokens, args);
10217                 switch (token) {
10218                 case IF_ACT_FILTER:
10219                 case IF_ACT_START:
10220                 case IF_ACT_STOP:
10221                         if (state != IF_STATE_ACTION)
10222                                 goto fail;
10223
10224                         filter->action = actions[token];
10225                         state = IF_STATE_SOURCE;
10226                         break;
10227
10228                 case IF_SRC_KERNELADDR:
10229                 case IF_SRC_KERNEL:
10230                         kernel = 1;
10231                         fallthrough;
10232
10233                 case IF_SRC_FILEADDR:
10234                 case IF_SRC_FILE:
10235                         if (state != IF_STATE_SOURCE)
10236                                 goto fail;
10237
10238                         *args[0].to = 0;
10239                         ret = kstrtoul(args[0].from, 0, &filter->offset);
10240                         if (ret)
10241                                 goto fail;
10242
10243                         if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) {
10244                                 *args[1].to = 0;
10245                                 ret = kstrtoul(args[1].from, 0, &filter->size);
10246                                 if (ret)
10247                                         goto fail;
10248                         }
10249
10250                         if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
10251                                 int fpos = token == IF_SRC_FILE ? 2 : 1;
10252
10253                                 kfree(filename);
10254                                 filename = match_strdup(&args[fpos]);
10255                                 if (!filename) {
10256                                         ret = -ENOMEM;
10257                                         goto fail;
10258                                 }
10259                         }
10260
10261                         state = IF_STATE_END;
10262                         break;
10263
10264                 default:
10265                         goto fail;
10266                 }
10267
10268                 /*
10269                  * Filter definition is fully parsed, validate and install it.
10270                  * Make sure that it doesn't contradict itself or the event's
10271                  * attribute.
10272                  */
10273                 if (state == IF_STATE_END) {
10274                         ret = -EINVAL;
10275                         if (kernel && event->attr.exclude_kernel)
10276                                 goto fail;
10277
10278                         /*
10279                          * ACTION "filter" must have a non-zero length region
10280                          * specified.
10281                          */
10282                         if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER &&
10283                             !filter->size)
10284                                 goto fail;
10285
10286                         if (!kernel) {
10287                                 if (!filename)
10288                                         goto fail;
10289
10290                                 /*
10291                                  * For now, we only support file-based filters
10292                                  * in per-task events; doing so for CPU-wide
10293                                  * events requires additional context switching
10294                                  * trickery, since the same object code will be
10295                                  * mapped at different virtual addresses in
10296                                  * different processes.
10297                                  */
10298                                 ret = -EOPNOTSUPP;
10299                                 if (!event->ctx->task)
10300                                         goto fail;
10301
10302                                 /* look up the path and grab its inode */
10303                                 ret = kern_path(filename, LOOKUP_FOLLOW,
10304                                                 &filter->path);
10305                                 if (ret)
10306                                         goto fail;
10307
10308                                 ret = -EINVAL;
10309                                 if (!filter->path.dentry ||
10310                                     !S_ISREG(d_inode(filter->path.dentry)
10311                                              ->i_mode))
10312                                         goto fail;
10313
10314                                 event->addr_filters.nr_file_filters++;
10315                         }
10316
10317                         /* ready to consume more filters */
10318                         state = IF_STATE_ACTION;
10319                         filter = NULL;
10320                 }
10321         }
10322
10323         if (state != IF_STATE_ACTION)
10324                 goto fail;
10325
10326         kfree(filename);
10327         kfree(orig);
10328
10329         return 0;
10330
10331 fail:
10332         kfree(filename);
10333         free_filters_list(filters);
10334         kfree(orig);
10335
10336         return ret;
10337 }
10338
10339 static int
10340 perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
10341 {
10342         LIST_HEAD(filters);
10343         int ret;
10344
10345         /*
10346          * Since this is called in the perf_ioctl() path, we're already holding
10347          * ctx::mutex.
10348          */
10349         lockdep_assert_held(&event->ctx->mutex);
10350
10351         if (WARN_ON_ONCE(event->parent))
10352                 return -EINVAL;
10353
10354         ret = perf_event_parse_addr_filter(event, filter_str, &filters);
10355         if (ret)
10356                 goto fail_clear_files;
10357
10358         ret = event->pmu->addr_filters_validate(&filters);
10359         if (ret)
10360                 goto fail_free_filters;
10361
10362         /* remove existing filters, if any */
10363         perf_addr_filters_splice(event, &filters);
10364
10365         /* install new filters */
10366         perf_event_for_each_child(event, perf_event_addr_filters_apply);
10367
10368         return ret;
10369
10370 fail_free_filters:
10371         free_filters_list(&filters);
10372
10373 fail_clear_files:
10374         event->addr_filters.nr_file_filters = 0;
10375
10376         return ret;
10377 }
10378
10379 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
10380 {
10381         int ret = -EINVAL;
10382         char *filter_str;
10383
10384         filter_str = strndup_user(arg, PAGE_SIZE);
10385         if (IS_ERR(filter_str))
10386                 return PTR_ERR(filter_str);
10387
10388 #ifdef CONFIG_EVENT_TRACING
10389         if (perf_event_is_tracing(event)) {
10390                 struct perf_event_context *ctx = event->ctx;
10391
10392                 /*
10393                  * Beware, here be dragons!!
10394                  *
10395                  * the tracepoint muck will deadlock against ctx->mutex, but
10396                  * the tracepoint stuff does not actually need it. So
10397                  * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we
10398                  * already have a reference on ctx.
10399                  *
10400                  * This can result in the event getting moved to a different ctx,
10401                  * but that does not affect the tracepoint state.
10402                  */
10403                 mutex_unlock(&ctx->mutex);
10404                 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
10405                 mutex_lock(&ctx->mutex);
10406         } else
10407 #endif
10408         if (has_addr_filter(event))
10409                 ret = perf_event_set_addr_filter(event, filter_str);
10410
10411         kfree(filter_str);
10412         return ret;
10413 }
10414
10415 /*
10416  * hrtimer based swevent callback
10417  */
10418
10419 static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
10420 {
10421         enum hrtimer_restart ret = HRTIMER_RESTART;
10422         struct perf_sample_data data;
10423         struct pt_regs *regs;
10424         struct perf_event *event;
10425         u64 period;
10426
10427         event = container_of(hrtimer, struct perf_event, hw.hrtimer);
10428
10429         if (event->state != PERF_EVENT_STATE_ACTIVE)
10430                 return HRTIMER_NORESTART;
10431
10432         event->pmu->read(event);
10433
10434         perf_sample_data_init(&data, 0, event->hw.last_period);
10435         regs = get_irq_regs();
10436
10437         if (regs && !perf_exclude_event(event, regs)) {
10438                 if (!(event->attr.exclude_idle && is_idle_task(current)))
10439                         if (__perf_event_overflow(event, 1, &data, regs))
10440                                 ret = HRTIMER_NORESTART;
10441         }
10442
10443         period = max_t(u64, 10000, event->hw.sample_period);
10444         hrtimer_forward_now(hrtimer, ns_to_ktime(period));
10445
10446         return ret;
10447 }
10448
10449 static void perf_swevent_start_hrtimer(struct perf_event *event)
10450 {
10451         struct hw_perf_event *hwc = &event->hw;
10452         s64 period;
10453
10454         if (!is_sampling_event(event))
10455                 return;
10456
10457         period = local64_read(&hwc->period_left);
10458         if (period) {
10459                 if (period < 0)
10460                         period = 10000;
10461
10462                 local64_set(&hwc->period_left, 0);
10463         } else {
10464                 period = max_t(u64, 10000, hwc->sample_period);
10465         }
10466         hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
10467                       HRTIMER_MODE_REL_PINNED_HARD);
10468 }
10469
10470 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
10471 {
10472         struct hw_perf_event *hwc = &event->hw;
10473
10474         if (is_sampling_event(event)) {
10475                 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
10476                 local64_set(&hwc->period_left, ktime_to_ns(remaining));
10477
10478                 hrtimer_cancel(&hwc->hrtimer);
10479         }
10480 }
10481
10482 static void perf_swevent_init_hrtimer(struct perf_event *event)
10483 {
10484         struct hw_perf_event *hwc = &event->hw;
10485
10486         if (!is_sampling_event(event))
10487                 return;
10488
10489         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
10490         hwc->hrtimer.function = perf_swevent_hrtimer;
10491
10492         /*
10493          * Since hrtimers have a fixed rate, we can do a static freq->period
10494          * mapping and avoid the whole period adjust feedback stuff.
10495          */
10496         if (event->attr.freq) {
10497                 long freq = event->attr.sample_freq;
10498
10499                 event->attr.sample_period = NSEC_PER_SEC / freq;
10500                 hwc->sample_period = event->attr.sample_period;
10501                 local64_set(&hwc->period_left, hwc->sample_period);
10502                 hwc->last_period = hwc->sample_period;
10503                 event->attr.freq = 0;
10504         }
10505 }
10506
10507 /*
10508  * Software event: cpu wall time clock
10509  */
10510
10511 static void cpu_clock_event_update(struct perf_event *event)
10512 {
10513         s64 prev;
10514         u64 now;
10515
10516         now = local_clock();
10517         prev = local64_xchg(&event->hw.prev_count, now);
10518         local64_add(now - prev, &event->count);
10519 }
10520
10521 static void cpu_clock_event_start(struct perf_event *event, int flags)
10522 {
10523         local64_set(&event->hw.prev_count, local_clock());
10524         perf_swevent_start_hrtimer(event);
10525 }
10526
10527 static void cpu_clock_event_stop(struct perf_event *event, int flags)
10528 {
10529         perf_swevent_cancel_hrtimer(event);
10530         cpu_clock_event_update(event);
10531 }
10532
10533 static int cpu_clock_event_add(struct perf_event *event, int flags)
10534 {
10535         if (flags & PERF_EF_START)
10536                 cpu_clock_event_start(event, flags);
10537         perf_event_update_userpage(event);
10538
10539         return 0;
10540 }
10541
10542 static void cpu_clock_event_del(struct perf_event *event, int flags)
10543 {
10544         cpu_clock_event_stop(event, flags);
10545 }
10546
10547 static void cpu_clock_event_read(struct perf_event *event)
10548 {
10549         cpu_clock_event_update(event);
10550 }
10551
10552 static int cpu_clock_event_init(struct perf_event *event)
10553 {
10554         if (event->attr.type != PERF_TYPE_SOFTWARE)
10555                 return -ENOENT;
10556
10557         if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
10558                 return -ENOENT;
10559
10560         /*
10561          * no branch sampling for software events
10562          */
10563         if (has_branch_stack(event))
10564                 return -EOPNOTSUPP;
10565
10566         perf_swevent_init_hrtimer(event);
10567
10568         return 0;
10569 }
10570
10571 static struct pmu perf_cpu_clock = {
10572         .task_ctx_nr    = perf_sw_context,
10573
10574         .capabilities   = PERF_PMU_CAP_NO_NMI,
10575
10576         .event_init     = cpu_clock_event_init,
10577         .add            = cpu_clock_event_add,
10578         .del            = cpu_clock_event_del,
10579         .start          = cpu_clock_event_start,
10580         .stop           = cpu_clock_event_stop,
10581         .read           = cpu_clock_event_read,
10582 };
10583
10584 /*
10585  * Software event: task time clock
10586  */
10587
10588 static void task_clock_event_update(struct perf_event *event, u64 now)
10589 {
10590         u64 prev;
10591         s64 delta;
10592
10593         prev = local64_xchg(&event->hw.prev_count, now);
10594         delta = now - prev;
10595         local64_add(delta, &event->count);
10596 }
10597
10598 static void task_clock_event_start(struct perf_event *event, int flags)
10599 {
10600         local64_set(&event->hw.prev_count, event->ctx->time);
10601         perf_swevent_start_hrtimer(event);
10602 }
10603
10604 static void task_clock_event_stop(struct perf_event *event, int flags)
10605 {
10606         perf_swevent_cancel_hrtimer(event);
10607         task_clock_event_update(event, event->ctx->time);
10608 }
10609
10610 static int task_clock_event_add(struct perf_event *event, int flags)
10611 {
10612         if (flags & PERF_EF_START)
10613                 task_clock_event_start(event, flags);
10614         perf_event_update_userpage(event);
10615
10616         return 0;
10617 }
10618
10619 static void task_clock_event_del(struct perf_event *event, int flags)
10620 {
10621         task_clock_event_stop(event, PERF_EF_UPDATE);
10622 }
10623
10624 static void task_clock_event_read(struct perf_event *event)
10625 {
10626         u64 now = perf_clock();
10627         u64 delta = now - event->ctx->timestamp;
10628         u64 time = event->ctx->time + delta;
10629
10630         task_clock_event_update(event, time);
10631 }
10632
10633 static int task_clock_event_init(struct perf_event *event)
10634 {
10635         if (event->attr.type != PERF_TYPE_SOFTWARE)
10636                 return -ENOENT;
10637
10638         if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
10639                 return -ENOENT;
10640
10641         /*
10642          * no branch sampling for software events
10643          */
10644         if (has_branch_stack(event))
10645                 return -EOPNOTSUPP;
10646
10647         perf_swevent_init_hrtimer(event);
10648
10649         return 0;
10650 }
10651
10652 static struct pmu perf_task_clock = {
10653         .task_ctx_nr    = perf_sw_context,
10654
10655         .capabilities   = PERF_PMU_CAP_NO_NMI,
10656
10657         .event_init     = task_clock_event_init,
10658         .add            = task_clock_event_add,
10659         .del            = task_clock_event_del,
10660         .start          = task_clock_event_start,
10661         .stop           = task_clock_event_stop,
10662         .read           = task_clock_event_read,
10663 };
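
/*
 * Illustrative sketch, not part of this file: the two clock PMUs above are
 * why perf_swevent_init() rejects PERF_COUNT_SW_CPU_CLOCK and
 * PERF_COUNT_SW_TASK_CLOCK -- those configs are served by these dedicated
 * hrtimer-based PMUs instead. From userspace they are opened like any other
 * software event; helper name and error handling are simplified.
 */
#if 0	/* userspace example, not kernel code */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>

static int open_sw_clock(uint64_t config)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_SOFTWARE;
	attr.size = sizeof(attr);
	attr.config = config;	/* PERF_COUNT_SW_CPU_CLOCK or PERF_COUNT_SW_TASK_CLOCK */

	return syscall(__NR_perf_event_open, &attr, 0 /* self */, -1, -1, 0);
}
#endif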
10664
10665 static void perf_pmu_nop_void(struct pmu *pmu)
10666 {
10667 }
10668
10669 static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
10670 {
10671 }
10672
10673 static int perf_pmu_nop_int(struct pmu *pmu)
10674 {
10675         return 0;
10676 }
10677
10678 static int perf_event_nop_int(struct perf_event *event, u64 value)
10679 {
10680         return 0;
10681 }
10682
10683 static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
10684
10685 static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
10686 {
10687         __this_cpu_write(nop_txn_flags, flags);
10688
10689         if (flags & ~PERF_PMU_TXN_ADD)
10690                 return;
10691
10692         perf_pmu_disable(pmu);
10693 }
10694
10695 static int perf_pmu_commit_txn(struct pmu *pmu)
10696 {
10697         unsigned int flags = __this_cpu_read(nop_txn_flags);
10698
10699         __this_cpu_write(nop_txn_flags, 0);
10700
10701         if (flags & ~PERF_PMU_TXN_ADD)
10702                 return 0;
10703
10704         perf_pmu_enable(pmu);
10705         return 0;
10706 }
10707
10708 static void perf_pmu_cancel_txn(struct pmu *pmu)
10709 {
10710         unsigned int flags =  __this_cpu_read(nop_txn_flags);
10711
10712         __this_cpu_write(nop_txn_flags, 0);
10713
10714         if (flags & ~PERF_PMU_TXN_ADD)
10715                 return;
10716
10717         perf_pmu_enable(pmu);
10718 }
10719
10720 static int perf_event_idx_default(struct perf_event *event)
10721 {
10722         return 0;
10723 }
10724
10725 /*
10726  * Ensures all contexts with the same task_ctx_nr have the same
10727  * pmu_cpu_context too.
10728  */
10729 static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
10730 {
10731         struct pmu *pmu;
10732
10733         if (ctxn < 0)
10734                 return NULL;
10735
10736         list_for_each_entry(pmu, &pmus, entry) {
10737                 if (pmu->task_ctx_nr == ctxn)
10738                         return pmu->pmu_cpu_context;
10739         }
10740
10741         return NULL;
10742 }
10743
10744 static void free_pmu_context(struct pmu *pmu)
10745 {
10746         /*
10747          * Static contexts such as perf_sw_context have a global lifetime
10748          * and may be shared between different PMUs. Avoid freeing them
10749          * when a single PMU is going away.
10750          */
10751         if (pmu->task_ctx_nr > perf_invalid_context)
10752                 return;
10753
10754         free_percpu(pmu->pmu_cpu_context);
10755 }
10756
10757 /*
10758  * Let userspace know that this PMU supports address range filtering:
10759  */
10760 static ssize_t nr_addr_filters_show(struct device *dev,
10761                                     struct device_attribute *attr,
10762                                     char *page)
10763 {
10764         struct pmu *pmu = dev_get_drvdata(dev);
10765
10766         return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
10767 }
10768 DEVICE_ATTR_RO(nr_addr_filters);
10769
10770 static struct idr pmu_idr;
10771
10772 static ssize_t
10773 type_show(struct device *dev, struct device_attribute *attr, char *page)
10774 {
10775         struct pmu *pmu = dev_get_drvdata(dev);
10776
10777         return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
10778 }
10779 static DEVICE_ATTR_RO(type);
10780
10781 static ssize_t
10782 perf_event_mux_interval_ms_show(struct device *dev,
10783                                 struct device_attribute *attr,
10784                                 char *page)
10785 {
10786         struct pmu *pmu = dev_get_drvdata(dev);
10787
10788         return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
10789 }
10790
10791 static DEFINE_MUTEX(mux_interval_mutex);
10792
10793 static ssize_t
10794 perf_event_mux_interval_ms_store(struct device *dev,
10795                                  struct device_attribute *attr,
10796                                  const char *buf, size_t count)
10797 {
10798         struct pmu *pmu = dev_get_drvdata(dev);
10799         int timer, cpu, ret;
10800
10801         ret = kstrtoint(buf, 0, &timer);
10802         if (ret)
10803                 return ret;
10804
10805         if (timer < 1)
10806                 return -EINVAL;
10807
10808         /* same value, nothing to do */
10809         if (timer == pmu->hrtimer_interval_ms)
10810                 return count;
10811
10812         mutex_lock(&mux_interval_mutex);
10813         pmu->hrtimer_interval_ms = timer;
10814
10815         /* update all cpuctx for this PMU */
10816         cpus_read_lock();
10817         for_each_online_cpu(cpu) {
10818                 struct perf_cpu_context *cpuctx;
10819                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
10820                 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
10821
10822                 cpu_function_call(cpu,
10823                         (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
10824         }
10825         cpus_read_unlock();
10826         mutex_unlock(&mux_interval_mutex);
10827
10828         return count;
10829 }
10830 static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
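
/*
 * Illustrative sketch, not part of this file: tuning a PMU's multiplexing
 * interval from userspace through the sysfs attribute defined above. The
 * PMU name "cpu" and the 2ms value are example assumptions; error handling
 * is minimal.
 */
#if 0	/* userspace example, not kernel code */
#include <stdio.h>

static int set_mux_interval_ms(const char *pmu, int ms)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/bus/event_source/devices/%s/perf_event_mux_interval_ms", pmu);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%d\n", ms);	/* parsed by perf_event_mux_interval_ms_store() */
	return fclose(f);
}

/* e.g. set_mux_interval_ms("cpu", 2); */
#endif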
10831
10832 static struct attribute *pmu_dev_attrs[] = {
10833         &dev_attr_type.attr,
10834         &dev_attr_perf_event_mux_interval_ms.attr,
10835         NULL,
10836 };
10837 ATTRIBUTE_GROUPS(pmu_dev);
10838
10839 static int pmu_bus_running;
10840 static struct bus_type pmu_bus = {
10841         .name           = "event_source",
10842         .dev_groups     = pmu_dev_groups,
10843 };
10844
10845 static void pmu_dev_release(struct device *dev)
10846 {
10847         kfree(dev);
10848 }
10849
10850 static int pmu_dev_alloc(struct pmu *pmu)
10851 {
10852         int ret = -ENOMEM;
10853
10854         pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
10855         if (!pmu->dev)
10856                 goto out;
10857
10858         pmu->dev->groups = pmu->attr_groups;
10859         device_initialize(pmu->dev);
10860         ret = dev_set_name(pmu->dev, "%s", pmu->name);
10861         if (ret)
10862                 goto free_dev;
10863
10864         dev_set_drvdata(pmu->dev, pmu);
10865         pmu->dev->bus = &pmu_bus;
10866         pmu->dev->release = pmu_dev_release;
10867         ret = device_add(pmu->dev);
10868         if (ret)
10869                 goto free_dev;
10870
10871         /* For PMUs with address filters, throw in an extra attribute: */
10872         if (pmu->nr_addr_filters)
10873                 ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
10874
10875         if (ret)
10876                 goto del_dev;
10877
10878         if (pmu->attr_update)
10879                 ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
10880
10881         if (ret)
10882                 goto del_dev;
10883
10884 out:
10885         return ret;
10886
10887 del_dev:
10888         device_del(pmu->dev);
10889
10890 free_dev:
10891         put_device(pmu->dev);
10892         goto out;
10893 }
10894
10895 static struct lock_class_key cpuctx_mutex;
10896 static struct lock_class_key cpuctx_lock;
10897
10898 int perf_pmu_register(struct pmu *pmu, const char *name, int type)
10899 {
10900         int cpu, ret, max = PERF_TYPE_MAX;
10901
10902         mutex_lock(&pmus_lock);
10903         ret = -ENOMEM;
10904         pmu->pmu_disable_count = alloc_percpu(int);
10905         if (!pmu->pmu_disable_count)
10906                 goto unlock;
10907
10908         pmu->type = -1;
10909         if (!name)
10910                 goto skip_type;
10911         pmu->name = name;
10912
10913         if (type != PERF_TYPE_SOFTWARE) {
10914                 if (type >= 0)
10915                         max = type;
10916
10917                 ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
10918                 if (ret < 0)
10919                         goto free_pdc;
10920
10921                 WARN_ON(type >= 0 && ret != type);
10922
10923                 type = ret;
10924         }
10925         pmu->type = type;
10926
10927         if (pmu_bus_running) {
10928                 ret = pmu_dev_alloc(pmu);
10929                 if (ret)
10930                         goto free_idr;
10931         }
10932
10933 skip_type:
10934         if (pmu->task_ctx_nr == perf_hw_context) {
10935                 static int hw_context_taken = 0;
10936
10937                 /*
10938                  * Other than systems with heterogeneous CPUs, it never makes
10939                  * sense for two PMUs to share perf_hw_context. PMUs which are
10940                  * uncore must use perf_invalid_context.
10941                  */
10942                 if (WARN_ON_ONCE(hw_context_taken &&
10943                     !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
10944                         pmu->task_ctx_nr = perf_invalid_context;
10945
10946                 hw_context_taken = 1;
10947         }
10948
10949         pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
10950         if (pmu->pmu_cpu_context)
10951                 goto got_cpu_context;
10952
10953         ret = -ENOMEM;
10954         pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
10955         if (!pmu->pmu_cpu_context)
10956                 goto free_dev;
10957
10958         for_each_possible_cpu(cpu) {
10959                 struct perf_cpu_context *cpuctx;
10960
10961                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
10962                 __perf_event_init_context(&cpuctx->ctx);
10963                 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
10964                 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
10965                 cpuctx->ctx.pmu = pmu;
10966                 cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
10967
10968                 __perf_mux_hrtimer_init(cpuctx, cpu);
10969
10970                 cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
10971                 cpuctx->heap = cpuctx->heap_default;
10972         }
10973
10974 got_cpu_context:
10975         if (!pmu->start_txn) {
10976                 if (pmu->pmu_enable) {
10977                         /*
10978                          * If we have pmu_enable/pmu_disable calls, install
10979                          * transaction stubs that use them to try and batch
10980                          * hardware accesses.
10981                          */
10982                         pmu->start_txn  = perf_pmu_start_txn;
10983                         pmu->commit_txn = perf_pmu_commit_txn;
10984                         pmu->cancel_txn = perf_pmu_cancel_txn;
10985                 } else {
10986                         pmu->start_txn  = perf_pmu_nop_txn;
10987                         pmu->commit_txn = perf_pmu_nop_int;
10988                         pmu->cancel_txn = perf_pmu_nop_void;
10989                 }
10990         }
10991
10992         if (!pmu->pmu_enable) {
10993                 pmu->pmu_enable  = perf_pmu_nop_void;
10994                 pmu->pmu_disable = perf_pmu_nop_void;
10995         }
10996
10997         if (!pmu->check_period)
10998                 pmu->check_period = perf_event_nop_int;
10999
11000         if (!pmu->event_idx)
11001                 pmu->event_idx = perf_event_idx_default;
11002
11003         /*
11004          * Ensure the TYPE_SOFTWARE PMUs are at the head of the list,
11005          * since these cannot be in the IDR. This way the linear search
11006          * is fast, provided a valid software event is provided.
11007          * is fast whenever a valid software event is requested.
11008         if (type == PERF_TYPE_SOFTWARE || !name)
11009                 list_add_rcu(&pmu->entry, &pmus);
11010         else
11011                 list_add_tail_rcu(&pmu->entry, &pmus);
11012
11013         atomic_set(&pmu->exclusive_cnt, 0);
11014         ret = 0;
11015 unlock:
11016         mutex_unlock(&pmus_lock);
11017
11018         return ret;
11019
11020 free_dev:
11021         device_del(pmu->dev);
11022         put_device(pmu->dev);
11023
11024 free_idr:
11025         if (pmu->type != PERF_TYPE_SOFTWARE)
11026                 idr_remove(&pmu_idr, pmu->type);
11027
11028 free_pdc:
11029         free_percpu(pmu->pmu_disable_count);
11030         goto unlock;
11031 }
11032 EXPORT_SYMBOL_GPL(perf_pmu_register);
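/*
 * Illustrative sketch only (kept inert under #if 0): roughly what a
 * minimal driver-side perf_pmu_register() user looks like. All
 * "example_*" names are hypothetical; callback bodies that would
 * program real hardware are elided.
 */
#if 0
static void example_read(struct perf_event *event)
{
	/* A real PMU would fold the hardware counter into event->count. */
	local64_set(&event->count, 0);
}

static int example_event_init(struct perf_event *event)
{
	/* Only accept events of our dynamically assigned type. */
	if (event->attr.type != event->pmu->type)
		return -ENOENT;
	return 0;
}

static int example_add(struct perf_event *event, int flags)
{
	return 0;	/* program the counter here */
}

static void example_del(struct perf_event *event, int flags) { }
static void example_start(struct perf_event *event, int flags) { }
static void example_stop(struct perf_event *event, int flags) { }

static struct pmu example_pmu = {
	.task_ctx_nr	= perf_invalid_context,	/* uncore-style PMU */
	.event_init	= example_event_init,
	.add		= example_add,
	.del		= example_del,
	.start		= example_start,
	.stop		= example_stop,
	.read		= example_read,
};

/* From a driver's init path: perf_pmu_register(&example_pmu, "example", -1); */
#endif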
11033
11034 void perf_pmu_unregister(struct pmu *pmu)
11035 {
11036         mutex_lock(&pmus_lock);
11037         list_del_rcu(&pmu->entry);
11038
11039         /*
11040          * We dereference the pmu list under both SRCU and regular RCU, so
11041          * synchronize against both of those.
11042          */
11043         synchronize_srcu(&pmus_srcu);
11044         synchronize_rcu();
11045
11046         free_percpu(pmu->pmu_disable_count);
11047         if (pmu->type != PERF_TYPE_SOFTWARE)
11048                 idr_remove(&pmu_idr, pmu->type);
11049         if (pmu_bus_running) {
11050                 if (pmu->nr_addr_filters)
11051                         device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
11052                 device_del(pmu->dev);
11053                 put_device(pmu->dev);
11054         }
11055         free_pmu_context(pmu);
11056         mutex_unlock(&pmus_lock);
11057 }
11058 EXPORT_SYMBOL_GPL(perf_pmu_unregister);
11059
11060 static inline bool has_extended_regs(struct perf_event *event)
11061 {
11062         return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) ||
11063                (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
11064 }
11065
11066 static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
11067 {
11068         struct perf_event_context *ctx = NULL;
11069         int ret;
11070
11071         if (!try_module_get(pmu->module))
11072                 return -ENODEV;
11073
11074         /*
11075          * A number of pmu->event_init() methods iterate the sibling_list to,
11076          * for example, validate if the group fits on the PMU. Therefore,
11077          * if this is a sibling event, acquire the ctx->mutex to protect
11078          * the sibling_list.
11079          */
11080         if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
11081                 /*
11082                  * This ctx->mutex can nest when we're called through
11083                  * inheritance. See the perf_event_ctx_lock_nested() comment.
11084                  */
11085                 ctx = perf_event_ctx_lock_nested(event->group_leader,
11086                                                  SINGLE_DEPTH_NESTING);
11087                 BUG_ON(!ctx);
11088         }
11089
11090         event->pmu = pmu;
11091         ret = pmu->event_init(event);
11092
11093         if (ctx)
11094                 perf_event_ctx_unlock(event->group_leader, ctx);
11095
11096         if (!ret) {
11097                 if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
11098                     has_extended_regs(event))
11099                         ret = -EOPNOTSUPP;
11100
11101                 if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
11102                     event_has_any_exclude_flag(event))
11103                         ret = -EINVAL;
11104
11105                 if (ret && event->destroy)
11106                         event->destroy(event);
11107         }
11108
11109         if (ret)
11110                 module_put(pmu->module);
11111
11112         return ret;
11113 }
11114
11115 static struct pmu *perf_init_event(struct perf_event *event)
11116 {
11117         int idx, type, ret;
11118         struct pmu *pmu;
11119
11120         idx = srcu_read_lock(&pmus_srcu);
11121
11122         /* Try parent's PMU first: */
11123         if (event->parent && event->parent->pmu) {
11124                 pmu = event->parent->pmu;
11125                 ret = perf_try_init_event(pmu, event);
11126                 if (!ret)
11127                         goto unlock;
11128         }
11129
11130         /*
11131          * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
11132          * are often aliases for PERF_TYPE_RAW.
11133          */
11134         type = event->attr.type;
11135         if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE)
11136                 type = PERF_TYPE_RAW;
11137
11138 again:
11139         rcu_read_lock();
11140         pmu = idr_find(&pmu_idr, type);
11141         rcu_read_unlock();
11142         if (pmu) {
11143                 ret = perf_try_init_event(pmu, event);
11144                 if (ret == -ENOENT && event->attr.type != type) {
11145                         type = event->attr.type;
11146                         goto again;
11147                 }
11148
11149                 if (ret)
11150                         pmu = ERR_PTR(ret);
11151
11152                 goto unlock;
11153         }
11154
11155         list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
11156                 ret = perf_try_init_event(pmu, event);
11157                 if (!ret)
11158                         goto unlock;
11159
11160                 if (ret != -ENOENT) {
11161                         pmu = ERR_PTR(ret);
11162                         goto unlock;
11163                 }
11164         }
11165         pmu = ERR_PTR(-ENOENT);
11166 unlock:
11167         srcu_read_unlock(&pmus_srcu, idx);
11168
11169         return pmu;
11170 }
11171
11172 static void attach_sb_event(struct perf_event *event)
11173 {
11174         struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
11175
11176         raw_spin_lock(&pel->lock);
11177         list_add_rcu(&event->sb_list, &pel->list);
11178         raw_spin_unlock(&pel->lock);
11179 }
11180
11181 /*
11182  * We keep a list of all !task (and therefore per-cpu) events
11183  * that need to receive side-band records.
11184  *
11185  * This avoids having to scan all the various PMU per-cpu contexts
11186  * looking for them.
11187  */
11188 static void account_pmu_sb_event(struct perf_event *event)
11189 {
11190         if (is_sb_event(event))
11191                 attach_sb_event(event);
11192 }
11193
11194 static void account_event_cpu(struct perf_event *event, int cpu)
11195 {
11196         if (event->parent)
11197                 return;
11198
11199         if (is_cgroup_event(event))
11200                 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
11201 }
11202
11203 /* Freq events need the tick to stay alive (see perf_event_task_tick). */
11204 static void account_freq_event_nohz(void)
11205 {
11206 #ifdef CONFIG_NO_HZ_FULL
11207         /* Lock so we don't race with concurrent unaccount */
11208         spin_lock(&nr_freq_lock);
11209         if (atomic_inc_return(&nr_freq_events) == 1)
11210                 tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
11211         spin_unlock(&nr_freq_lock);
11212 #endif
11213 }
11214
11215 static void account_freq_event(void)
11216 {
11217         if (tick_nohz_full_enabled())
11218                 account_freq_event_nohz();
11219         else
11220                 atomic_inc(&nr_freq_events);
11221 }
11222
11223
11224 static void account_event(struct perf_event *event)
11225 {
11226         bool inc = false;
11227
11228         if (event->parent)
11229                 return;
11230
11231         if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
11232                 inc = true;
11233         if (event->attr.mmap || event->attr.mmap_data)
11234                 atomic_inc(&nr_mmap_events);
11235         if (event->attr.build_id)
11236                 atomic_inc(&nr_build_id_events);
11237         if (event->attr.comm)
11238                 atomic_inc(&nr_comm_events);
11239         if (event->attr.namespaces)
11240                 atomic_inc(&nr_namespaces_events);
11241         if (event->attr.cgroup)
11242                 atomic_inc(&nr_cgroup_events);
11243         if (event->attr.task)
11244                 atomic_inc(&nr_task_events);
11245         if (event->attr.freq)
11246                 account_freq_event();
11247         if (event->attr.context_switch) {
11248                 atomic_inc(&nr_switch_events);
11249                 inc = true;
11250         }
11251         if (has_branch_stack(event))
11252                 inc = true;
11253         if (is_cgroup_event(event))
11254                 inc = true;
11255         if (event->attr.ksymbol)
11256                 atomic_inc(&nr_ksymbol_events);
11257         if (event->attr.bpf_event)
11258                 atomic_inc(&nr_bpf_events);
11259         if (event->attr.text_poke)
11260                 atomic_inc(&nr_text_poke_events);
11261
11262         if (inc) {
11263                 /*
11264                  * We need the mutex here because static_branch_enable()
11265                  * must complete *before* the perf_sched_count increment
11266                  * becomes visible.
11267                  */
11268                 if (atomic_inc_not_zero(&perf_sched_count))
11269                         goto enabled;
11270
11271                 mutex_lock(&perf_sched_mutex);
11272                 if (!atomic_read(&perf_sched_count)) {
11273                         static_branch_enable(&perf_sched_events);
11274                         /*
11275                          * Guarantee that all CPUs observe the key change and
11276                          * call the perf scheduling hooks before proceeding to
11277                          * install events that need them.
11278                          */
11279                         synchronize_rcu();
11280                 }
11281                 /*
11282                  * Now that we have waited for the synchronize_rcu() above, allow
11283                  * further increments to bypass the mutex.
11284                  */
11285                 atomic_inc(&perf_sched_count);
11286                 mutex_unlock(&perf_sched_mutex);
11287         }
11288 enabled:
11289
11290         account_event_cpu(event, event->cpu);
11291
11292         account_pmu_sb_event(event);
11293 }
11294
11295 /*
11296  * Allocate and initialize an event structure
11297  */
11298 static struct perf_event *
11299 perf_event_alloc(struct perf_event_attr *attr, int cpu,
11300                  struct task_struct *task,
11301                  struct perf_event *group_leader,
11302                  struct perf_event *parent_event,
11303                  perf_overflow_handler_t overflow_handler,
11304                  void *context, int cgroup_fd)
11305 {
11306         struct pmu *pmu;
11307         struct perf_event *event;
11308         struct hw_perf_event *hwc;
11309         long err = -EINVAL;
11310         int node;
11311
11312         if ((unsigned)cpu >= nr_cpu_ids) {
11313                 if (!task || cpu != -1)
11314                         return ERR_PTR(-EINVAL);
11315         }
11316
11317         node = (cpu >= 0) ? cpu_to_node(cpu) : -1;
11318         event = kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO,
11319                                       node);
11320         if (!event)
11321                 return ERR_PTR(-ENOMEM);
11322
11323         /*
11324          * Single events are their own group leaders, with an
11325          * empty sibling list:
11326          */
11327         if (!group_leader)
11328                 group_leader = event;
11329
11330         mutex_init(&event->child_mutex);
11331         INIT_LIST_HEAD(&event->child_list);
11332
11333         INIT_LIST_HEAD(&event->event_entry);
11334         INIT_LIST_HEAD(&event->sibling_list);
11335         INIT_LIST_HEAD(&event->active_list);
11336         init_event_group(event);
11337         INIT_LIST_HEAD(&event->rb_entry);
11338         INIT_LIST_HEAD(&event->active_entry);
11339         INIT_LIST_HEAD(&event->addr_filters.list);
11340         INIT_HLIST_NODE(&event->hlist_entry);
11341
11342
11343         init_waitqueue_head(&event->waitq);
11344         event->pending_disable = -1;
11345         init_irq_work(&event->pending, perf_pending_event);
11346
11347         mutex_init(&event->mmap_mutex);
11348         raw_spin_lock_init(&event->addr_filters.lock);
11349
11350         atomic_long_set(&event->refcount, 1);
11351         event->cpu              = cpu;
11352         event->attr             = *attr;
11353         event->group_leader     = group_leader;
11354         event->pmu              = NULL;
11355         event->oncpu            = -1;
11356
11357         event->parent           = parent_event;
11358
11359         event->ns               = get_pid_ns(task_active_pid_ns(current));
11360         event->id               = atomic64_inc_return(&perf_event_id);
11361
11362         event->state            = PERF_EVENT_STATE_INACTIVE;
11363
11364         if (task) {
11365                 event->attach_state = PERF_ATTACH_TASK;
11366                 /*
11367                  * XXX pmu::event_init needs to know what task to account to
11368                  * and we cannot use the ctx information because we need the
11369                  * pmu before we get a ctx.
11370                  */
11371                 event->hw.target = get_task_struct(task);
11372         }
11373
11374         event->clock = &local_clock;
11375         if (parent_event)
11376                 event->clock = parent_event->clock;
11377
11378         if (!overflow_handler && parent_event) {
11379                 overflow_handler = parent_event->overflow_handler;
11380                 context = parent_event->overflow_handler_context;
11381 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
11382                 if (overflow_handler == bpf_overflow_handler) {
11383                         struct bpf_prog *prog = parent_event->prog;
11384
11385                         bpf_prog_inc(prog);
11386                         event->prog = prog;
11387                         event->orig_overflow_handler =
11388                                 parent_event->orig_overflow_handler;
11389                 }
11390 #endif
11391         }
11392
11393         if (overflow_handler) {
11394                 event->overflow_handler = overflow_handler;
11395                 event->overflow_handler_context = context;
11396         } else if (is_write_backward(event)) {
11397                 event->overflow_handler = perf_event_output_backward;
11398                 event->overflow_handler_context = NULL;
11399         } else {
11400                 event->overflow_handler = perf_event_output_forward;
11401                 event->overflow_handler_context = NULL;
11402         }
11403
11404         perf_event__state_init(event);
11405
11406         pmu = NULL;
11407
11408         hwc = &event->hw;
11409         hwc->sample_period = attr->sample_period;
11410         if (attr->freq && attr->sample_freq)
11411                 hwc->sample_period = 1;
11412         hwc->last_period = hwc->sample_period;
11413
11414         local64_set(&hwc->period_left, hwc->sample_period);
11415
11416         /*
11417          * We currently do not support PERF_SAMPLE_READ on inherited events.
11418          * See perf_output_read().
11419          */
11420         if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
11421                 goto err_ns;
11422
11423         if (!has_branch_stack(event))
11424                 event->attr.branch_sample_type = 0;
11425
11426         pmu = perf_init_event(event);
11427         if (IS_ERR(pmu)) {
11428                 err = PTR_ERR(pmu);
11429                 goto err_ns;
11430         }
11431
11432         /*
11433          * Disallow uncore-cgroup events; they don't make sense as the cgroup will
11434          * be different on other CPUs in the uncore mask.
11435          */
11436         if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) {
11437                 err = -EINVAL;
11438                 goto err_pmu;
11439         }
11440
11441         if (event->attr.aux_output &&
11442             !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
11443                 err = -EOPNOTSUPP;
11444                 goto err_pmu;
11445         }
11446
11447         if (cgroup_fd != -1) {
11448                 err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
11449                 if (err)
11450                         goto err_pmu;
11451         }
11452
11453         err = exclusive_event_init(event);
11454         if (err)
11455                 goto err_pmu;
11456
11457         if (has_addr_filter(event)) {
11458                 event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
11459                                                     sizeof(struct perf_addr_filter_range),
11460                                                     GFP_KERNEL);
11461                 if (!event->addr_filter_ranges) {
11462                         err = -ENOMEM;
11463                         goto err_per_task;
11464                 }
11465
11466                 /*
11467                  * Clone the parent's vma offsets: they are valid until exec()
11468                  * even if the mm is not shared with the parent.
11469                  */
11470                 if (event->parent) {
11471                         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
11472
11473                         raw_spin_lock_irq(&ifh->lock);
11474                         memcpy(event->addr_filter_ranges,
11475                                event->parent->addr_filter_ranges,
11476                                pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range));
11477                         raw_spin_unlock_irq(&ifh->lock);
11478                 }
11479
11480                 /* force hw sync on the address filters */
11481                 event->addr_filters_gen = 1;
11482         }
11483
11484         if (!event->parent) {
11485                 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
11486                         err = get_callchain_buffers(attr->sample_max_stack);
11487                         if (err)
11488                                 goto err_addr_filters;
11489                 }
11490         }
11491
11492         err = security_perf_event_alloc(event);
11493         if (err)
11494                 goto err_callchain_buffer;
11495
11496         /* symmetric to unaccount_event() in _free_event() */
11497         account_event(event);
11498
11499         return event;
11500
11501 err_callchain_buffer:
11502         if (!event->parent) {
11503                 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
11504                         put_callchain_buffers();
11505         }
11506 err_addr_filters:
11507         kfree(event->addr_filter_ranges);
11508
11509 err_per_task:
11510         exclusive_event_destroy(event);
11511
11512 err_pmu:
11513         if (is_cgroup_event(event))
11514                 perf_detach_cgroup(event);
11515         if (event->destroy)
11516                 event->destroy(event);
11517         module_put(pmu->module);
11518 err_ns:
11519         if (event->ns)
11520                 put_pid_ns(event->ns);
11521         if (event->hw.target)
11522                 put_task_struct(event->hw.target);
11523         kmem_cache_free(perf_event_cache, event);
11524
11525         return ERR_PTR(err);
11526 }
11527
11528 static int perf_copy_attr(struct perf_event_attr __user *uattr,
11529                           struct perf_event_attr *attr)
11530 {
11531         u32 size;
11532         int ret;
11533
11534         /* Zero the full structure, so that a short copy leaves the rest zeroed. */
11535         memset(attr, 0, sizeof(*attr));
11536
11537         ret = get_user(size, &uattr->size);
11538         if (ret)
11539                 return ret;
11540
11541         /* ABI compatibility quirk: */
11542         if (!size)
11543                 size = PERF_ATTR_SIZE_VER0;
11544         if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE)
11545                 goto err_size;
11546
11547         ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
11548         if (ret) {
11549                 if (ret == -E2BIG)
11550                         goto err_size;
11551                 return ret;
11552         }
11553
11554         attr->size = size;
11555
11556         if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
11557                 return -EINVAL;
11558
11559         if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
11560                 return -EINVAL;
11561
11562         if (attr->read_format & ~(PERF_FORMAT_MAX-1))
11563                 return -EINVAL;
11564
11565         if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
11566                 u64 mask = attr->branch_sample_type;
11567
11568                 /* only using defined bits */
11569                 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
11570                         return -EINVAL;
11571
11572                 /* at least one branch bit must be set */
11573                 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
11574                         return -EINVAL;
11575
11576                 /* propagate priv level, when not set for branch */
11577                 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
11578
11579                         /* exclude_kernel checked on syscall entry */
11580                         if (!attr->exclude_kernel)
11581                                 mask |= PERF_SAMPLE_BRANCH_KERNEL;
11582
11583                         if (!attr->exclude_user)
11584                                 mask |= PERF_SAMPLE_BRANCH_USER;
11585
11586                         if (!attr->exclude_hv)
11587                                 mask |= PERF_SAMPLE_BRANCH_HV;
11588                         /*
11589                          * adjust user setting (for HW filter setup)
11590                          */
11591                         attr->branch_sample_type = mask;
11592                 }
11593                 /* privileged levels capture (kernel, hv): check permissions */
11594                 if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
11595                         ret = perf_allow_kernel(attr);
11596                         if (ret)
11597                                 return ret;
11598                 }
11599         }
11600
11601         if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
11602                 ret = perf_reg_validate(attr->sample_regs_user);
11603                 if (ret)
11604                         return ret;
11605         }
11606
11607         if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
11608                 if (!arch_perf_have_user_stack_dump())
11609                         return -ENOSYS;
11610
11611                 /*
11612                  * We have __u32 type for the size, but so far
11613                  * we can only use __u16 as maximum due to the
11614                  * __u16 sample size limit.
11615                  */
11616                 if (attr->sample_stack_user >= USHRT_MAX)
11617                         return -EINVAL;
11618                 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
11619                         return -EINVAL;
11620         }
11621
11622         if (!attr->sample_max_stack)
11623                 attr->sample_max_stack = sysctl_perf_event_max_stack;
11624
11625         if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
11626                 ret = perf_reg_validate(attr->sample_regs_intr);
11627
11628 #ifndef CONFIG_CGROUP_PERF
11629         if (attr->sample_type & PERF_SAMPLE_CGROUP)
11630                 return -EINVAL;
11631 #endif
11632         if ((attr->sample_type & PERF_SAMPLE_WEIGHT) &&
11633             (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT))
11634                 return -EINVAL;
11635
11636 out:
11637         return ret;
11638
11639 err_size:
11640         put_user(sizeof(*attr), &uattr->size);
11641         ret = -E2BIG;
11642         goto out;
11643 }
11644
11645 static int
11646 perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
11647 {
11648         struct perf_buffer *rb = NULL;
11649         int ret = -EINVAL;
11650
11651         if (!output_event)
11652                 goto set;
11653
11654         /* don't allow circular references */
11655         if (event == output_event)
11656                 goto out;
11657
11658         /*
11659          * Don't allow cross-cpu buffers
11660          */
11661         if (output_event->cpu != event->cpu)
11662                 goto out;
11663
11664         /*
11665          * If it's not a per-cpu rb, it must be the same task.
11666          */
11667         if (output_event->cpu == -1 && output_event->ctx != event->ctx)
11668                 goto out;
11669
11670         /*
11671          * Mixing clocks in the same buffer is trouble you don't need.
11672          */
11673         if (output_event->clock != event->clock)
11674                 goto out;
11675
11676         /*
11677          * Either write the ring buffer from the beginning or from the end.
11678          * Mixing is not allowed.
11679          */
11680         if (is_write_backward(output_event) != is_write_backward(event))
11681                 goto out;
11682
11683         /*
11684          * If both events generate aux data, they must be on the same PMU
11685          */
11686         if (has_aux(event) && has_aux(output_event) &&
11687             event->pmu != output_event->pmu)
11688                 goto out;
11689
11690 set:
11691         mutex_lock(&event->mmap_mutex);
11692         /* Can't redirect output if we've got an active mmap() */
11693         if (atomic_read(&event->mmap_count))
11694                 goto unlock;
11695
11696         if (output_event) {
11697                 /* get the rb we want to redirect to */
11698                 rb = ring_buffer_get(output_event);
11699                 if (!rb)
11700                         goto unlock;
11701         }
11702
11703         ring_buffer_attach(event, rb);
11704
11705         ret = 0;
11706 unlock:
11707         mutex_unlock(&event->mmap_mutex);
11708
11709 out:
11710         return ret;
11711 }
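/*
 * Illustrative userspace sketch (hypothetical fds "fd" and "target_fd",
 * kept inert under #if 0): the ioctl path that ends up in
 * perf_event_set_output(), redirecting one event's output into another
 * event's ring buffer. Both events must pass the checks above: same CPU
 * or task, same clock, same write direction.
 */
#if 0
	if (ioctl(fd, PERF_EVENT_IOC_SET_OUTPUT, target_fd))
		perror("PERF_EVENT_IOC_SET_OUTPUT");
#endif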
11712
11713 static void mutex_lock_double(struct mutex *a, struct mutex *b)
11714 {
11715         if (b < a)
11716                 swap(a, b);
11717
11718         mutex_lock(a);
11719         mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
11720 }
11721
11722 static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
11723 {
11724         bool nmi_safe = false;
11725
11726         switch (clk_id) {
11727         case CLOCK_MONOTONIC:
11728                 event->clock = &ktime_get_mono_fast_ns;
11729                 nmi_safe = true;
11730                 break;
11731
11732         case CLOCK_MONOTONIC_RAW:
11733                 event->clock = &ktime_get_raw_fast_ns;
11734                 nmi_safe = true;
11735                 break;
11736
11737         case CLOCK_REALTIME:
11738                 event->clock = &ktime_get_real_ns;
11739                 break;
11740
11741         case CLOCK_BOOTTIME:
11742                 event->clock = &ktime_get_boottime_ns;
11743                 break;
11744
11745         case CLOCK_TAI:
11746                 event->clock = &ktime_get_clocktai_ns;
11747                 break;
11748
11749         default:
11750                 return -EINVAL;
11751         }
11752
11753         if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
11754                 return -EINVAL;
11755
11756         return 0;
11757 }
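/*
 * Illustrative userspace sketch (kept inert under #if 0): selecting the
 * sample clock handled by perf_event_set_clock() above via
 * attr.use_clockid/attr.clockid. Period and event choice are arbitrary
 * example values.
 */
#if 0
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_SOFTWARE,
		.size		= sizeof(attr),
		.config		= PERF_COUNT_SW_TASK_CLOCK,
		.sample_period	= 100000,
		.sample_type	= PERF_SAMPLE_TIME,
		.use_clockid	= 1,
		.clockid	= CLOCK_MONOTONIC_RAW,
	};
	int fd = syscall(__NR_perf_event_open, &attr, 0 /* this task */,
			 -1 /* any cpu */, -1 /* no group */, 0);
#endif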
11758
11759 /*
11760  * Variation on perf_event_ctx_lock_nested(), except we take two context
11761  * mutexes.
11762  */
11763 static struct perf_event_context *
11764 __perf_event_ctx_lock_double(struct perf_event *group_leader,
11765                              struct perf_event_context *ctx)
11766 {
11767         struct perf_event_context *gctx;
11768
11769 again:
11770         rcu_read_lock();
11771         gctx = READ_ONCE(group_leader->ctx);
11772         if (!refcount_inc_not_zero(&gctx->refcount)) {
11773                 rcu_read_unlock();
11774                 goto again;
11775         }
11776         rcu_read_unlock();
11777
11778         mutex_lock_double(&gctx->mutex, &ctx->mutex);
11779
11780         if (group_leader->ctx != gctx) {
11781                 mutex_unlock(&ctx->mutex);
11782                 mutex_unlock(&gctx->mutex);
11783                 put_ctx(gctx);
11784                 goto again;
11785         }
11786
11787         return gctx;
11788 }
11789
11790 /**
11791  * sys_perf_event_open - open a performance event, associate it to a task/cpu
11792  *
11793  * @attr_uptr:  event_id type attributes for monitoring/sampling
11794  * @pid:                target pid
11795  * @cpu:                target cpu
11796  * @group_fd:           group leader event fd
11797  */
11798 SYSCALL_DEFINE5(perf_event_open,
11799                 struct perf_event_attr __user *, attr_uptr,
11800                 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
11801 {
11802         struct perf_event *group_leader = NULL, *output_event = NULL;
11803         struct perf_event *event, *sibling;
11804         struct perf_event_attr attr;
11805         struct perf_event_context *ctx, *gctx;
11806         struct file *event_file = NULL;
11807         struct fd group = {NULL, 0};
11808         struct task_struct *task = NULL;
11809         struct pmu *pmu;
11810         int event_fd;
11811         int move_group = 0;
11812         int err;
11813         int f_flags = O_RDWR;
11814         int cgroup_fd = -1;
11815
11816         /* for future expandability... */
11817         if (flags & ~PERF_FLAG_ALL)
11818                 return -EINVAL;
11819
11820         /* Do we allow access to perf_event_open(2) ? */
11821         err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
11822         if (err)
11823                 return err;
11824
11825         err = perf_copy_attr(attr_uptr, &attr);
11826         if (err)
11827                 return err;
11828
11829         if (!attr.exclude_kernel) {
11830                 err = perf_allow_kernel(&attr);
11831                 if (err)
11832                         return err;
11833         }
11834
11835         if (attr.namespaces) {
11836                 if (!perfmon_capable())
11837                         return -EACCES;
11838         }
11839
11840         if (attr.freq) {
11841                 if (attr.sample_freq > sysctl_perf_event_sample_rate)
11842                         return -EINVAL;
11843         } else {
11844                 if (attr.sample_period & (1ULL << 63))
11845                         return -EINVAL;
11846         }
11847
11848         /* Only privileged users can get physical addresses */
11849         if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
11850                 err = perf_allow_kernel(&attr);
11851                 if (err)
11852                         return err;
11853         }
11854
11855         /* REGS_INTR can leak data, lockdown must prevent this */
11856         if (attr.sample_type & PERF_SAMPLE_REGS_INTR) {
11857                 err = security_locked_down(LOCKDOWN_PERF);
11858                 if (err)
11859                         return err;
11860         }
11861
11862         /*
11863          * In cgroup mode, the pid argument is used to pass the fd
11864          * opened to the cgroup directory in cgroupfs. The cpu argument
11865          * designates the cpu on which to monitor threads from that
11866          * cgroup.
11867          */
11868         if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
11869                 return -EINVAL;
11870
11871         if (flags & PERF_FLAG_FD_CLOEXEC)
11872                 f_flags |= O_CLOEXEC;
11873
11874         event_fd = get_unused_fd_flags(f_flags);
11875         if (event_fd < 0)
11876                 return event_fd;
11877
11878         if (group_fd != -1) {
11879                 err = perf_fget_light(group_fd, &group);
11880                 if (err)
11881                         goto err_fd;
11882                 group_leader = group.file->private_data;
11883                 if (flags & PERF_FLAG_FD_OUTPUT)
11884                         output_event = group_leader;
11885                 if (flags & PERF_FLAG_FD_NO_GROUP)
11886                         group_leader = NULL;
11887         }
11888
11889         if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
11890                 task = find_lively_task_by_vpid(pid);
11891                 if (IS_ERR(task)) {
11892                         err = PTR_ERR(task);
11893                         goto err_group_fd;
11894                 }
11895         }
11896
11897         if (task && group_leader &&
11898             group_leader->attr.inherit != attr.inherit) {
11899                 err = -EINVAL;
11900                 goto err_task;
11901         }
11902
11903         if (flags & PERF_FLAG_PID_CGROUP)
11904                 cgroup_fd = pid;
11905
11906         event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
11907                                  NULL, NULL, cgroup_fd);
11908         if (IS_ERR(event)) {
11909                 err = PTR_ERR(event);
11910                 goto err_task;
11911         }
11912
11913         if (is_sampling_event(event)) {
11914                 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
11915                         err = -EOPNOTSUPP;
11916                         goto err_alloc;
11917                 }
11918         }
11919
11920         /*
11921          * Special case software events and allow them to be part of
11922          * any hardware group.
11923          */
11924         pmu = event->pmu;
11925
11926         if (attr.use_clockid) {
11927                 err = perf_event_set_clock(event, attr.clockid);
11928                 if (err)
11929                         goto err_alloc;
11930         }
11931
11932         if (pmu->task_ctx_nr == perf_sw_context)
11933                 event->event_caps |= PERF_EV_CAP_SOFTWARE;
11934
11935         if (group_leader) {
11936                 if (is_software_event(event) &&
11937                     !in_software_context(group_leader)) {
11938                         /*
11939                          * The event is a software event, but the group_leader
11940                          * is on a hardware context.
11941                          *
11942                          * Allow the addition of software events to hw
11943                          * groups; this is safe because software events
11944                          * never fail to schedule.
11945                          */
11946                         pmu = group_leader->ctx->pmu;
11947                 } else if (!is_software_event(event) &&
11948                            is_software_event(group_leader) &&
11949                            (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
11950                         /*
11951                          * In case the group is a pure software group, and we
11952                          * try to add a hardware event, move the whole group to
11953                          * the hardware context.
11954                          */
11955                         move_group = 1;
11956                 }
11957         }
11958
11959         /*
11960          * Get the target context (task or percpu):
11961          */
11962         ctx = find_get_context(pmu, task, event);
11963         if (IS_ERR(ctx)) {
11964                 err = PTR_ERR(ctx);
11965                 goto err_alloc;
11966         }
11967
11968         /*
11969          * Look up the group leader (we will attach this event to it):
11970          */
11971         if (group_leader) {
11972                 err = -EINVAL;
11973
11974                 /*
11975                  * Do not allow a recursive hierarchy (this new sibling
11976                  * becoming part of another group-sibling):
11977                  */
11978                 if (group_leader->group_leader != group_leader)
11979                         goto err_context;
11980
11981                 /* All events in a group should have the same clock */
11982                 if (group_leader->clock != event->clock)
11983                         goto err_context;
11984
11985                 /*
11986                  * Make sure both events are for the same CPU;
11987                  * grouping events for different CPUs is broken, since
11988                  * you can never concurrently schedule them anyhow.
11989                  */
11990                 if (group_leader->cpu != event->cpu)
11991                         goto err_context;
11992
11993                 /*
11994                  * Make sure both events are on the same task, or both
11995                  * are per-CPU events.
11996                  */
11997                 if (group_leader->ctx->task != ctx->task)
11998                         goto err_context;
11999
12000                 /*
12001                  * Do not allow attaching to a group in a different task
12002                  * or CPU context. If we're moving SW events, we'll fix
12003                  * this up later, so allow that.
12004                  */
12005                 if (!move_group && group_leader->ctx != ctx)
12006                         goto err_context;
12007
12008                 /*
12009                  * Only a group leader can be exclusive or pinned
12010                  */
12011                 if (attr.exclusive || attr.pinned)
12012                         goto err_context;
12013         }
12014
12015         if (output_event) {
12016                 err = perf_event_set_output(event, output_event);
12017                 if (err)
12018                         goto err_context;
12019         }
12020
12021         event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
12022                                         f_flags);
12023         if (IS_ERR(event_file)) {
12024                 err = PTR_ERR(event_file);
12025                 event_file = NULL;
12026                 goto err_context;
12027         }
12028
12029         if (task) {
12030                 err = down_read_interruptible(&task->signal->exec_update_lock);
12031                 if (err)
12032                         goto err_file;
12033
12034                 /*
12035                  * Preserve ptrace permission check for backwards compatibility.
12036                  *
12037                  * We must hold exec_update_lock across this and any potential
12038                  * perf_install_in_context() call for this new event to
12039                  * serialize against exec() altering our credentials (and the
12040                  * perf_event_exit_task() that could imply).
12041                  */
12042                 err = -EACCES;
12043                 if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
12044                         goto err_cred;
12045         }
12046
12047         if (move_group) {
12048                 gctx = __perf_event_ctx_lock_double(group_leader, ctx);
12049
12050                 if (gctx->task == TASK_TOMBSTONE) {
12051                         err = -ESRCH;
12052                         goto err_locked;
12053                 }
12054
12055                 /*
12056                  * Check if we raced against another sys_perf_event_open() call
12057                  * moving the software group underneath us.
12058                  */
12059                 if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
12060                         /*
12061                          * If someone moved the group out from under us, check
12062                          * if this new event wound up on the same ctx; if so,
12063                          * it's the regular !move_group case, otherwise fail.
12064                          */
12065                         if (gctx != ctx) {
12066                                 err = -EINVAL;
12067                                 goto err_locked;
12068                         } else {
12069                                 perf_event_ctx_unlock(group_leader, gctx);
12070                                 move_group = 0;
12071                         }
12072                 }
12073
12074                 /*
12075                  * Failure to create exclusive events returns -EBUSY.
12076                  */
12077                 err = -EBUSY;
12078                 if (!exclusive_event_installable(group_leader, ctx))
12079                         goto err_locked;
12080
12081                 for_each_sibling_event(sibling, group_leader) {
12082                         if (!exclusive_event_installable(sibling, ctx))
12083                                 goto err_locked;
12084                 }
12085         } else {
12086                 mutex_lock(&ctx->mutex);
12087         }
12088
12089         if (ctx->task == TASK_TOMBSTONE) {
12090                 err = -ESRCH;
12091                 goto err_locked;
12092         }
12093
12094         if (!perf_event_validate_size(event)) {
12095                 err = -E2BIG;
12096                 goto err_locked;
12097         }
12098
12099         if (!task) {
12100                 /*
12101                  * Check if the @cpu we're creating an event for is online.
12102                  *
12103                  * We use the perf_cpu_context::ctx::mutex to serialize against
12104                  * the hotplug notifiers. See perf_event_{init,exit}_cpu().
12105                  */
12106                 struct perf_cpu_context *cpuctx =
12107                         container_of(ctx, struct perf_cpu_context, ctx);
12108
12109                 if (!cpuctx->online) {
12110                         err = -ENODEV;
12111                         goto err_locked;
12112                 }
12113         }
12114
12115         if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
12116                 err = -EINVAL;
12117                 goto err_locked;
12118         }
12119
12120         /*
12121          * Must be under the same ctx::mutex as perf_install_in_context(),
12122          * because we need to serialize with concurrent event creation.
12123          */
12124         if (!exclusive_event_installable(event, ctx)) {
12125                 err = -EBUSY;
12126                 goto err_locked;
12127         }
12128
12129         WARN_ON_ONCE(ctx->parent_ctx);
12130
12131         /*
12132          * This is the point of no return; we cannot fail hereafter. This is
12133          * where we start modifying current state.
12134          */
12135
12136         if (move_group) {
12137                 /*
12138                  * See perf_event_ctx_lock() for comments on the details
12139                  * of swizzling perf_event::ctx.
12140                  */
12141                 perf_remove_from_context(group_leader, 0);
12142                 put_ctx(gctx);
12143
12144                 for_each_sibling_event(sibling, group_leader) {
12145                         perf_remove_from_context(sibling, 0);
12146                         put_ctx(gctx);
12147                 }
12148
12149                 /*
12150                  * Wait for everybody to stop referencing the events through
12151                  * the old lists, before installing them on the new lists.
12152                  */
12153                 synchronize_rcu();
12154
12155                 /*
12156                  * Install the group siblings before the group leader.
12157                  *
12158                  * Because a group leader will try and install the entire group
12159                  * (through the sibling list, which is still intact), we can
12160                  * end up with siblings installed in the wrong context.
12161                  *
12162                  * By installing siblings first we NO-OP because they're not
12163                  * reachable through the group lists.
12164                  */
12165                 for_each_sibling_event(sibling, group_leader) {
12166                         perf_event__state_init(sibling);
12167                         perf_install_in_context(ctx, sibling, sibling->cpu);
12168                         get_ctx(ctx);
12169                 }
12170
12171                 /*
12172                  * Removing from the context ends up with a disabled
12173                  * event. What we want here is the event in its initial
12174                  * startup state, ready to be added into the new context.
12175                  */
12176                 perf_event__state_init(group_leader);
12177                 perf_install_in_context(ctx, group_leader, group_leader->cpu);
12178                 get_ctx(ctx);
12179         }
12180
12181         /*
12182          * Precalculate sample_data sizes; do while holding ctx::mutex such
12183          * that we're serialized against further additions and before
12184          * perf_install_in_context() which is the point the event is active and
12185          * can use these values.
12186          */
12187         perf_event__header_size(event);
12188         perf_event__id_header_size(event);
12189
12190         event->owner = current;
12191
12192         perf_install_in_context(ctx, event, event->cpu);
12193         perf_unpin_context(ctx);
12194
12195         if (move_group)
12196                 perf_event_ctx_unlock(group_leader, gctx);
12197         mutex_unlock(&ctx->mutex);
12198
12199         if (task) {
12200                 up_read(&task->signal->exec_update_lock);
12201                 put_task_struct(task);
12202         }
12203
12204         mutex_lock(&current->perf_event_mutex);
12205         list_add_tail(&event->owner_entry, &current->perf_event_list);
12206         mutex_unlock(&current->perf_event_mutex);
12207
12208         /*
12209          * Drop the reference on the group_event after placing the
12210          * new event on the sibling_list. This ensures destruction
12211          * of the group leader will find the pointer to itself in
12212          * perf_group_detach().
12213          */
12214         fdput(group);
12215         fd_install(event_fd, event_file);
12216         return event_fd;
12217
12218 err_locked:
12219         if (move_group)
12220                 perf_event_ctx_unlock(group_leader, gctx);
12221         mutex_unlock(&ctx->mutex);
12222 err_cred:
12223         if (task)
12224                 up_read(&task->signal->exec_update_lock);
12225 err_file:
12226         fput(event_file);
12227 err_context:
12228         perf_unpin_context(ctx);
12229         put_ctx(ctx);
12230 err_alloc:
12231         /*
12232          * If event_file is set, the fput() above will have called ->release()
12233          * and that will take care of freeing the event.
12234          */
12235         if (!event_file)
12236                 free_event(event);
12237 err_task:
12238         if (task)
12239                 put_task_struct(task);
12240 err_group_fd:
12241         fdput(group);
12242 err_fd:
12243         put_unused_fd(event_fd);
12244         return err;
12245 }
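/*
 * Illustrative userspace sketch (kept inert under #if 0): the grouping
 * path of the syscall above. The second open passes the leader's fd as
 * group_fd so both events are scheduled onto the PMU together. Event
 * choices are arbitrary example values.
 */
#if 0
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_HARDWARE,
		.size		= sizeof(attr),
		.config		= PERF_COUNT_HW_CPU_CYCLES,
		.disabled	= 1,
	};
	int leader = syscall(__NR_perf_event_open, &attr, 0, -1, -1,
			     PERF_FLAG_FD_CLOEXEC);

	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
	attr.disabled = 0;
	int member = syscall(__NR_perf_event_open, &attr, 0, -1, leader,
			     PERF_FLAG_FD_CLOEXEC);

	ioctl(leader, PERF_EVENT_IOC_ENABLE, 0);
#endif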
12246
12247 /**
12248  * perf_event_create_kernel_counter
12249  *
12250  * @attr: attributes of the counter to create
12251  * @cpu: cpu in which the counter is bound
12252  * @task: task to profile (NULL for percpu)
12253  */
12254 struct perf_event *
12255 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
12256                                  struct task_struct *task,
12257                                  perf_overflow_handler_t overflow_handler,
12258                                  void *context)
12259 {
12260         struct perf_event_context *ctx;
12261         struct perf_event *event;
12262         int err;
12263
12264         /*
12265          * Grouping is not supported for kernel events, and neither is 'AUX';
12266          * make sure the caller's intentions are adjusted.
12267          */
12268         if (attr->aux_output)
12269                 return ERR_PTR(-EINVAL);
12270
12271         event = perf_event_alloc(attr, cpu, task, NULL, NULL,
12272                                  overflow_handler, context, -1);
12273         if (IS_ERR(event)) {
12274                 err = PTR_ERR(event);
12275                 goto err;
12276         }
12277
12278         /* Mark owner so we can distinguish it from user events. */
12279         event->owner = TASK_TOMBSTONE;
12280
12281         /*
12282          * Get the target context (task or percpu):
12283          */
12284         ctx = find_get_context(event->pmu, task, event);
12285         if (IS_ERR(ctx)) {
12286                 err = PTR_ERR(ctx);
12287                 goto err_free;
12288         }
12289
12290         WARN_ON_ONCE(ctx->parent_ctx);
12291         mutex_lock(&ctx->mutex);
12292         if (ctx->task == TASK_TOMBSTONE) {
12293                 err = -ESRCH;
12294                 goto err_unlock;
12295         }
12296
12297         if (!task) {
12298                 /*
12299                  * Check if the @cpu we're creating an event for is online.
12300                  *
12301                  * We use the perf_cpu_context::ctx::mutex to serialize against
12302                  * the hotplug notifiers. See perf_event_{init,exit}_cpu().
12303                  */
12304                 struct perf_cpu_context *cpuctx =
12305                         container_of(ctx, struct perf_cpu_context, ctx);
12306                 if (!cpuctx->online) {
12307                         err = -ENODEV;
12308                         goto err_unlock;
12309                 }
12310         }
12311
12312         if (!exclusive_event_installable(event, ctx)) {
12313                 err = -EBUSY;
12314                 goto err_unlock;
12315         }
12316
12317         perf_install_in_context(ctx, event, event->cpu);
12318         perf_unpin_context(ctx);
12319         mutex_unlock(&ctx->mutex);
12320
12321         return event;
12322
12323 err_unlock:
12324         mutex_unlock(&ctx->mutex);
12325         perf_unpin_context(ctx);
12326         put_ctx(ctx);
12327 err_free:
12328         free_event(event);
12329 err:
12330         return ERR_PTR(err);
12331 }
12332 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
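/*
 * Illustrative in-kernel sketch (kept inert under #if 0): a per-CPU
 * cycle counter with an overflow handler, the typical shape of a
 * perf_event_create_kernel_counter() caller. "example_*" names and the
 * sample period are hypothetical.
 */
#if 0
static void example_overflow(struct perf_event *event,
			     struct perf_sample_data *data,
			     struct pt_regs *regs)
{
	/* runs in NMI/IRQ context when the counter overflows */
}

static struct perf_event *example_create_on(int cpu)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_HARDWARE,
		.size		= sizeof(attr),
		.config		= PERF_COUNT_HW_CPU_CYCLES,
		.sample_period	= 1000000,
	};

	return perf_event_create_kernel_counter(&attr, cpu, NULL,
						example_overflow, NULL);
}
#endif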
12333
12334 void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
12335 {
12336         struct perf_event_context *src_ctx;
12337         struct perf_event_context *dst_ctx;
12338         struct perf_event *event, *tmp;
12339         LIST_HEAD(events);
12340
12341         src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
12342         dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
12343
12344         /*
12345          * See perf_event_ctx_lock() for comments on the details
12346          * of swizzling perf_event::ctx.
12347          */
12348         mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
12349         list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
12350                                  event_entry) {
12351                 perf_remove_from_context(event, 0);
12352                 unaccount_event_cpu(event, src_cpu);
12353                 put_ctx(src_ctx);
12354                 list_add(&event->migrate_entry, &events);
12355         }
12356
12357         /*
12358          * Wait for the events to quiesce before reinstating them.
12359          */
12360         synchronize_rcu();
12361
12362         /*
12363          * Reinstate events in two passes.
12364          *
12365          * Skip over group leaders and only install siblings on this first
12366          * pass; siblings will not get enabled without a leader. However, a
12367          * leader will enable its siblings, even if those are still on the old
12368          * context.
12369          */
12370         list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
12371                 if (event->group_leader == event)
12372                         continue;
12373
12374                 list_del(&event->migrate_entry);
12375                 if (event->state >= PERF_EVENT_STATE_OFF)
12376                         event->state = PERF_EVENT_STATE_INACTIVE;
12377                 account_event_cpu(event, dst_cpu);
12378                 perf_install_in_context(dst_ctx, event, dst_cpu);
12379                 get_ctx(dst_ctx);
12380         }
12381
12382         /*
12383          * Once all the siblings are set up properly, install the group leaders
12384          * to make it go.
12385          */
12386         list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
12387                 list_del(&event->migrate_entry);
12388                 if (event->state >= PERF_EVENT_STATE_OFF)
12389                         event->state = PERF_EVENT_STATE_INACTIVE;
12390                 account_event_cpu(event, dst_cpu);
12391                 perf_install_in_context(dst_ctx, event, dst_cpu);
12392                 get_ctx(dst_ctx);
12393         }
12394         mutex_unlock(&dst_ctx->mutex);
12395         mutex_unlock(&src_ctx->mutex);
12396 }
12397 EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
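/*
 * Illustrative sketch (kept inert under #if 0): how an uncore-style
 * driver typically uses perf_pmu_migrate_context() from a CPU-hotplug
 * callback, moving its context off a dying CPU onto a surviving one.
 * "example_pmu" and the target selection are hypothetical.
 */
#if 0
static int example_cpu_offline(unsigned int cpu)
{
	unsigned int target = cpumask_any_but(cpu_online_mask, cpu);

	if (target < nr_cpu_ids)
		perf_pmu_migrate_context(&example_pmu, cpu, target);
	return 0;
}
#endif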
12398
12399 static void sync_child_event(struct perf_event *child_event)
12400 {
12401         struct perf_event *parent_event = child_event->parent;
12402         u64 child_val;
12403
12404         if (child_event->attr.inherit_stat) {
12405                 struct task_struct *task = child_event->ctx->task;
12406
12407                 if (task && task != TASK_TOMBSTONE)
12408                         perf_event_read_event(child_event, task);
12409         }
12410
12411         child_val = perf_event_count(child_event);
12412
12413         /*
12414          * Add back the child's count to the parent's count:
12415          */
12416         atomic64_add(child_val, &parent_event->child_count);
12417         atomic64_add(child_event->total_time_enabled,
12418                      &parent_event->child_total_time_enabled);
12419         atomic64_add(child_event->total_time_running,
12420                      &parent_event->child_total_time_running);
12421 }
12422
12423 static void
12424 perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
12425 {
12426         struct perf_event *parent_event = event->parent;
12427         unsigned long detach_flags = 0;
12428
12429         if (parent_event) {
12430                 /*
12431                  * Do not destroy the 'original' grouping; because of the
12432                  * context switch optimization the original events could've
12433                  * ended up in a random child task.
12434                  *
12435                  * If we were to destroy the original group, all group related
12436                  * operations would cease to function properly after this
12437                  * random child dies.
12438                  *
12439                  * Do destroy all inherited groups; we don't care about those
12440                  * and being thorough is better.
12441                  */
12442                 detach_flags = DETACH_GROUP | DETACH_CHILD;
12443                 mutex_lock(&parent_event->child_mutex);
12444         }
12445
12446         perf_remove_from_context(event, detach_flags);
12447
12448         raw_spin_lock_irq(&ctx->lock);
12449         if (event->state > PERF_EVENT_STATE_EXIT)
12450                 perf_event_set_state(event, PERF_EVENT_STATE_EXIT);
12451         raw_spin_unlock_irq(&ctx->lock);
12452
12453         /*
12454          * Child events can be freed.
12455          */
12456         if (parent_event) {
12457                 mutex_unlock(&parent_event->child_mutex);
12458                 /*
12459                  * Kick perf_poll() for is_event_hup();
12460                  */
12461                 perf_event_wakeup(parent_event);
12462                 free_event(event);
12463                 put_event(parent_event);
12464                 return;
12465         }
12466
12467         /*
12468          * Parent events are governed by their filedesc; retain them.
12469          */
12470         perf_event_wakeup(event);
12471 }
12472
12473 static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
12474 {
12475         struct perf_event_context *child_ctx, *clone_ctx = NULL;
12476         struct perf_event *child_event, *next;
12477
12478         WARN_ON_ONCE(child != current);
12479
12480         child_ctx = perf_pin_task_context(child, ctxn);
12481         if (!child_ctx)
12482                 return;
12483
12484         /*
12485          * In order to reduce the amount of trickery in ctx tear-down, we hold
12486          * ctx::mutex over the entire thing. This serializes against almost
12487          * everything that wants to access the ctx.
12488          *
12489          * The exception is sys_perf_event_open() /
12490          * perf_event_create_kernel_counter(), which does find_get_context()
12491          * without ctx::mutex (it cannot because of the move_group double mutex
12492          * lock thing). See the comments in perf_install_in_context().
12493          */
12494         mutex_lock(&child_ctx->mutex);
12495
12496         /*
12497          * In a single ctx::lock section, de-schedule the events and detach the
12498          * context from the task such that we cannot ever get it scheduled back
12499          * in.
12500          */
12501         raw_spin_lock_irq(&child_ctx->lock);
12502         task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
12503
12504         /*
12505          * Now that the context is inactive, destroy the task <-> ctx relation
12506          * and mark the context dead.
12507          */
12508         RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
12509         put_ctx(child_ctx); /* cannot be last */
12510         WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
12511         put_task_struct(current); /* cannot be last */
12512
12513         clone_ctx = unclone_ctx(child_ctx);
12514         raw_spin_unlock_irq(&child_ctx->lock);
12515
12516         if (clone_ctx)
12517                 put_ctx(clone_ctx);
12518
12519         /*
12520          * Report the task dead after unscheduling the events so that we
12521          * won't get any samples after PERF_RECORD_EXIT. We can however still
12522          * get a few PERF_RECORD_READ events.
12523          */
12524         perf_event_task(child, child_ctx, 0);
12525
12526         list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
12527                 perf_event_exit_event(child_event, child_ctx);
12528
12529         mutex_unlock(&child_ctx->mutex);
12530
12531         put_ctx(child_ctx);
12532 }
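/*
 * Note (approximate): the PERF_RECORD_READ records mentioned above come from
 * sync_child_event() -> perf_event_read_event(), which runs as part of the
 * DETACH_CHILD handling for events created with attr.inherit_stat; those are
 * the only records expected after the PERF_RECORD_EXIT emitted here.
 */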
12533
12534 /*
12535  * When a child task exits, feed back event values to parent events.
12536  *
12537  * Can be called with exec_update_lock held, when invoked from
12538  * setup_new_exec().
12539  */
12540 void perf_event_exit_task(struct task_struct *child)
12541 {
12542         struct perf_event *event, *tmp;
12543         int ctxn;
12544
12545         mutex_lock(&child->perf_event_mutex);
12546         list_for_each_entry_safe(event, tmp, &child->perf_event_list,
12547                                  owner_entry) {
12548                 list_del_init(&event->owner_entry);
12549
12550                 /*
12551                  * Ensure the list deletion is visible before we clear
12552                  * the owner; this closes a race against perf_release() where
12553                  * we need to serialize on the owner->perf_event_mutex.
12554                  */
12555                 smp_store_release(&event->owner, NULL);
12556         }
12557         mutex_unlock(&child->perf_event_mutex);
12558
12559         for_each_task_context_nr(ctxn)
12560                 perf_event_exit_task_context(child, ctxn);
12561
12562         /*
12563          * perf_event_exit_task_context() calls perf_event_task() with the
12564          * child's task_ctx, which generates EXIT events for
12565          * child contexts and sets child->perf_event_ctxp[] to NULL.
12566          * At this point we need to send EXIT events to cpu contexts.
12567          */
12568         perf_event_task(child, NULL, 0);
12569 }
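/*
 * Typical caller (approximate sketch): the task exit path, e.g. do_exit() in
 * kernel/exit.c, does
 *
 *	perf_event_exit_task(tsk);
 *
 * so inherited counts are folded back into their parents before the task is
 * reaped.
 */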
12570
12571 static void perf_free_event(struct perf_event *event,
12572                             struct perf_event_context *ctx)
12573 {
12574         struct perf_event *parent = event->parent;
12575
12576         if (WARN_ON_ONCE(!parent))
12577                 return;
12578
12579         mutex_lock(&parent->child_mutex);
12580         list_del_init(&event->child_list);
12581         mutex_unlock(&parent->child_mutex);
12582
12583         put_event(parent);
12584
12585         raw_spin_lock_irq(&ctx->lock);
12586         perf_group_detach(event);
12587         list_del_event(event, ctx);
12588         raw_spin_unlock_irq(&ctx->lock);
12589         free_event(event);
12590 }
12591
12592 /*
12593  * Free a context as created by inheritance by perf_event_init_task() below,
12594  * used by fork() in case of failure.
12595  *
12596  * Even though the task has never lived, the context and events have been
12597  * exposed through the child_list, so we must take care tearing it all down.
12598  */
12599 void perf_event_free_task(struct task_struct *task)
12600 {
12601         struct perf_event_context *ctx;
12602         struct perf_event *event, *tmp;
12603         int ctxn;
12604
12605         for_each_task_context_nr(ctxn) {
12606                 ctx = task->perf_event_ctxp[ctxn];
12607                 if (!ctx)
12608                         continue;
12609
12610                 mutex_lock(&ctx->mutex);
12611                 raw_spin_lock_irq(&ctx->lock);
12612                 /*
12613                  * Destroy the task <-> ctx relation and mark the context dead.
12614                  *
12615                  * This is important because even though the task hasn't been
12616                  * exposed yet the context has been (through child_list).
12617                  */
12618                 RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
12619                 WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
12620                 put_task_struct(task); /* cannot be last */
12621                 raw_spin_unlock_irq(&ctx->lock);
12622
12623                 list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
12624                         perf_free_event(event, ctx);
12625
12626                 mutex_unlock(&ctx->mutex);
12627
12628                 /*
12629                  * perf_event_release_kernel() could've stolen some of our
12630                  * child events and still have them on its free_list. In that
12631                  * case we must wait for these events to have been freed (in
12632                  * particular all their references to this task must've been
12633                  * dropped).
12634                  *
12635                  * Without this, copy_process() will unconditionally free this
12636                  * task (irrespective of its reference count) and
12637                  * _free_event()'s put_task_struct(event->hw.target) will be a
12638                  * use-after-free.
12639                  *
12640                  * Wait for all events to drop their context reference.
12641                  */
12642                 wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
12643                 put_ctx(ctx); /* must be last */
12644         }
12645 }
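/*
 * This is the fork()-failure counterpart of perf_event_exit_task(); the error
 * path in copy_process() does, roughly,
 *
 *	bad_fork_cleanup_perf:
 *		perf_event_free_task(p);
 *
 * for a child that never ran.
 */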
12646
12647 void perf_event_delayed_put(struct task_struct *task)
12648 {
12649         int ctxn;
12650
12651         for_each_task_context_nr(ctxn)
12652                 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
12653 }
12654
12655 struct file *perf_event_get(unsigned int fd)
12656 {
12657         struct file *file = fget(fd);
12658         if (!file)
12659                 return ERR_PTR(-EBADF);
12660
12661         if (file->f_op != &perf_fops) {
12662                 fput(file);
12663                 return ERR_PTR(-EBADF);
12664         }
12665
12666         return file;
12667 }
12668
12669 const struct perf_event *perf_get_event(struct file *file)
12670 {
12671         if (file->f_op != &perf_fops)
12672                 return ERR_PTR(-EINVAL);
12673
12674         return file->private_data;
12675 }
12676
12677 const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
12678 {
12679         if (!event)
12680                 return ERR_PTR(-EINVAL);
12681
12682         return &event->attr;
12683 }
12684
12685 /*
12686  * Inherit an event from parent task to child task.
12687  *
12688  * Returns:
12689  *  - valid pointer on success
12690  *  - NULL for orphaned events
12691  *  - IS_ERR() on error
12692  */
12693 static struct perf_event *
12694 inherit_event(struct perf_event *parent_event,
12695               struct task_struct *parent,
12696               struct perf_event_context *parent_ctx,
12697               struct task_struct *child,
12698               struct perf_event *group_leader,
12699               struct perf_event_context *child_ctx)
12700 {
12701         enum perf_event_state parent_state = parent_event->state;
12702         struct perf_event *child_event;
12703         unsigned long flags;
12704
12705         /*
12706          * Instead of creating recursive hierarchies of events,
12707          * we link inherited events back to the original parent,
12708          * which is guaranteed to have a filp that we use as the
12709          * reference count:
12710          */
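	/*
	 * Illustration (approximate): the hierarchy is flattened, e.g.
	 *
	 *	parent task:	  event P  (owns the filp / refcount)
	 *	child task:	  event C1, C1->parent == P
	 *	grandchild task:  event C2, C2->parent == P	(not C1)
	 *
	 * because of the parent hop below.
	 */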
12711         if (parent_event->parent)
12712                 parent_event = parent_event->parent;
12713
12714         child_event = perf_event_alloc(&parent_event->attr,
12715                                            parent_event->cpu,
12716                                            child,
12717                                            group_leader, parent_event,
12718                                            NULL, NULL, -1);
12719         if (IS_ERR(child_event))
12720                 return child_event;
12721
12722
12723         if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
12724             !child_ctx->task_ctx_data) {
12725                 struct pmu *pmu = child_event->pmu;
12726
12727                 child_ctx->task_ctx_data = alloc_task_ctx_data(pmu);
12728                 if (!child_ctx->task_ctx_data) {
12729                         free_event(child_event);
12730                         return ERR_PTR(-ENOMEM);
12731                 }
12732         }
12733
12734         /*
12735          * is_orphaned_event() and list_add_tail(&parent_event->child_list)
12736          * must be under the same lock in order to serialize against
12737          * perf_event_release_kernel(), such that either we must observe
12738          * is_orphaned_event() or they will observe us on the child_list.
12739          */
12740         mutex_lock(&parent_event->child_mutex);
12741         if (is_orphaned_event(parent_event) ||
12742             !atomic_long_inc_not_zero(&parent_event->refcount)) {
12743                 mutex_unlock(&parent_event->child_mutex);
12744                 /* task_ctx_data is freed with child_ctx */
12745                 free_event(child_event);
12746                 return NULL;
12747         }
12748
12749         get_ctx(child_ctx);
12750
12751         /*
12752          * Make the child state follow the state of the parent event,
12753          * not its attr.disabled bit.  We hold the parent's mutex,
12754          * so we won't race with perf_event_{en, dis}able_family.
12755          */
12756         if (parent_state >= PERF_EVENT_STATE_INACTIVE)
12757                 child_event->state = PERF_EVENT_STATE_INACTIVE;
12758         else
12759                 child_event->state = PERF_EVENT_STATE_OFF;
12760
12761         if (parent_event->attr.freq) {
12762                 u64 sample_period = parent_event->hw.sample_period;
12763                 struct hw_perf_event *hwc = &child_event->hw;
12764
12765                 hwc->sample_period = sample_period;
12766                 hwc->last_period   = sample_period;
12767
12768                 local64_set(&hwc->period_left, sample_period);
12769         }
12770
12771         child_event->ctx = child_ctx;
12772         child_event->overflow_handler = parent_event->overflow_handler;
12773         child_event->overflow_handler_context
12774                 = parent_event->overflow_handler_context;
12775
12776         /*
12777          * Precalculate sample_data sizes
12778          */
12779         perf_event__header_size(child_event);
12780         perf_event__id_header_size(child_event);
12781
12782         /*
12783          * Link it up in the child's context:
12784          */
12785         raw_spin_lock_irqsave(&child_ctx->lock, flags);
12786         add_event_to_ctx(child_event, child_ctx);
12787         child_event->attach_state |= PERF_ATTACH_CHILD;
12788         raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
12789
12790         /*
12791          * Link this into the parent event's child list
12792          */
12793         list_add_tail(&child_event->child_list, &parent_event->child_list);
12794         mutex_unlock(&parent_event->child_mutex);
12795
12796         return child_event;
12797 }
12798
12799 /*
12800  * Inherits an event group.
12801  *
12802  * This will quietly suppress orphaned events; !inherit_event() is not an error.
12803  * This matches with perf_event_release_kernel() removing all child events.
12804  *
12805  * Returns:
12806  *  - 0 on success
12807  *  - <0 on error
12808  */
12809 static int inherit_group(struct perf_event *parent_event,
12810               struct task_struct *parent,
12811               struct perf_event_context *parent_ctx,
12812               struct task_struct *child,
12813               struct perf_event_context *child_ctx)
12814 {
12815         struct perf_event *leader;
12816         struct perf_event *sub;
12817         struct perf_event *child_ctr;
12818
12819         leader = inherit_event(parent_event, parent, parent_ctx,
12820                                  child, NULL, child_ctx);
12821         if (IS_ERR(leader))
12822                 return PTR_ERR(leader);
12823         /*
12824          * @leader can be NULL here because of is_orphaned_event(). In this
12825          * case inherit_event() will create individual events, similar to what
12826          * perf_group_detach() would do anyway.
12827          */
12828         for_each_sibling_event(sub, parent_event) {
12829                 child_ctr = inherit_event(sub, parent, parent_ctx,
12830                                             child, leader, child_ctx);
12831                 if (IS_ERR(child_ctr))
12832                         return PTR_ERR(child_ctr);
12833
12834                 if (sub->aux_event == parent_event && child_ctr &&
12835                     !perf_get_aux_event(child_ctr, leader))
12836                         return -EINVAL;
12837         }
12838         return 0;
12839 }
12840
12841 /*
12842  * Creates the child task context and tries to inherit the event-group.
12843  *
12844  * Clears @inherited_all on !attr.inherit or error. Note that we'll leave
12845  * inherited_all set when we 'fail' to inherit an orphaned event; this is
12846  * consistent with perf_event_release_kernel() removing all child events.
12847  *
12848  * Returns:
12849  *  - 0 on success
12850  *  - <0 on error
12851  */
12852 static int
12853 inherit_task_group(struct perf_event *event, struct task_struct *parent,
12854                    struct perf_event_context *parent_ctx,
12855                    struct task_struct *child, int ctxn,
12856                    int *inherited_all)
12857 {
12858         int ret;
12859         struct perf_event_context *child_ctx;
12860
12861         if (!event->attr.inherit) {
12862                 *inherited_all = 0;
12863                 return 0;
12864         }
12865
12866         child_ctx = child->perf_event_ctxp[ctxn];
12867         if (!child_ctx) {
12868                 /*
12869                  * This is executed from the parent task context, so
12870                  * inherit events that have been marked for cloning.
12871                  * First allocate and initialize a context for the
12872                  * child.
12873                  */
12874                 child_ctx = alloc_perf_context(parent_ctx->pmu, child);
12875                 if (!child_ctx)
12876                         return -ENOMEM;
12877
12878                 child->perf_event_ctxp[ctxn] = child_ctx;
12879         }
12880
12881         ret = inherit_group(event, parent, parent_ctx,
12882                             child, child_ctx);
12883
12884         if (ret)
12885                 *inherited_all = 0;
12886
12887         return ret;
12888 }
12889
12890 /*
12891  * Initialize the perf_event context in task_struct
12892  */
12893 static int perf_event_init_context(struct task_struct *child, int ctxn)
12894 {
12895         struct perf_event_context *child_ctx, *parent_ctx;
12896         struct perf_event_context *cloned_ctx;
12897         struct perf_event *event;
12898         struct task_struct *parent = current;
12899         int inherited_all = 1;
12900         unsigned long flags;
12901         int ret = 0;
12902
12903         if (likely(!parent->perf_event_ctxp[ctxn]))
12904                 return 0;
12905
12906         /*
12907          * If the parent's context is a clone, pin it so it won't get
12908          * swapped under us.
12909          */
12910         parent_ctx = perf_pin_task_context(parent, ctxn);
12911         if (!parent_ctx)
12912                 return 0;
12913
12914         /*
12915          * No need to check if parent_ctx != NULL here; since we saw
12916          * it non-NULL earlier, the only reason for it to become NULL
12917          * is if we exit, and since we're currently in the middle of
12918          * a fork we can't be exiting at the same time.
12919          */
12920
12921         /*
12922          * Lock the parent list. No need to lock the child - not PID
12923          * hashed yet and not running, so nobody can access it.
12924          */
12925         mutex_lock(&parent_ctx->mutex);
12926
12927         /*
12928          * We don't have to disable NMIs - we are only looking at
12929          * the list, not manipulating it:
12930          */
12931         perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
12932                 ret = inherit_task_group(event, parent, parent_ctx,
12933                                          child, ctxn, &inherited_all);
12934                 if (ret)
12935                         goto out_unlock;
12936         }
12937
12938         /*
12939          * We can't hold ctx->lock when iterating the ->flexible_groups list due
12940          * to allocations, but we need to prevent rotation because
12941          * rotate_ctx() will change the list from interrupt context.
12942          */
12943         raw_spin_lock_irqsave(&parent_ctx->lock, flags);
12944         parent_ctx->rotate_disable = 1;
12945         raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
12946
12947         perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
12948                 ret = inherit_task_group(event, parent, parent_ctx,
12949                                          child, ctxn, &inherited_all);
12950                 if (ret)
12951                         goto out_unlock;
12952         }
12953
12954         raw_spin_lock_irqsave(&parent_ctx->lock, flags);
12955         parent_ctx->rotate_disable = 0;
12956
12957         child_ctx = child->perf_event_ctxp[ctxn];
12958
12959         if (child_ctx && inherited_all) {
12960                 /*
12961                  * Mark the child context as a clone of the parent
12962                  * context, or of whatever the parent is a clone of.
12963                  *
12964                  * Note that if the parent is a clone, the holding of
12965                  * parent_ctx->lock prevents it from being uncloned.
12966                  */
12967                 cloned_ctx = parent_ctx->parent_ctx;
12968                 if (cloned_ctx) {
12969                         child_ctx->parent_ctx = cloned_ctx;
12970                         child_ctx->parent_gen = parent_ctx->parent_gen;
12971                 } else {
12972                         child_ctx->parent_ctx = parent_ctx;
12973                         child_ctx->parent_gen = parent_ctx->generation;
12974                 }
12975                 get_ctx(child_ctx->parent_ctx);
12976         }
12977
12978         raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
12979 out_unlock:
12980         mutex_unlock(&parent_ctx->mutex);
12981
12982         perf_unpin_context(parent_ctx);
12983         put_ctx(parent_ctx);
12984
12985         return ret;
12986 }
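/*
 * Note (approximate): the parent_ctx/parent_gen pair recorded above is what
 * allows the context-switch optimization to treat two task contexts as
 * clones; context_equiv() checks, among other things,
 *
 *	ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
 *	ctx1->parent_gen == ctx2->parent_gen
 *
 * before the scheduler may swap contexts instead of rescheduling every event.
 */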
12987
12988 /*
12989  * Initialize the perf_event context in task_struct
12990  */
12991 int perf_event_init_task(struct task_struct *child)
12992 {
12993         int ctxn, ret;
12994
12995         memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
12996         mutex_init(&child->perf_event_mutex);
12997         INIT_LIST_HEAD(&child->perf_event_list);
12998
12999         for_each_task_context_nr(ctxn) {
13000                 ret = perf_event_init_context(child, ctxn);
13001                 if (ret) {
13002                         perf_event_free_task(child);
13003                         return ret;
13004                 }
13005         }
13006
13007         return 0;
13008 }
13009
13010 static void __init perf_event_init_all_cpus(void)
13011 {
13012         struct swevent_htable *swhash;
13013         int cpu;
13014
13015         zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
13016
13017         for_each_possible_cpu(cpu) {
13018                 swhash = &per_cpu(swevent_htable, cpu);
13019                 mutex_init(&swhash->hlist_mutex);
13020                 INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
13021
13022                 INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
13023                 raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
13024
13025 #ifdef CONFIG_CGROUP_PERF
13026                 INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
13027 #endif
13028                 INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
13029         }
13030 }
13031
13032 static void perf_swevent_init_cpu(unsigned int cpu)
13033 {
13034         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
13035
13036         mutex_lock(&swhash->hlist_mutex);
13037         if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
13038                 struct swevent_hlist *hlist;
13039
13040                 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
13041                 WARN_ON(!hlist);
13042                 rcu_assign_pointer(swhash->swevent_hlist, hlist);
13043         }
13044         mutex_unlock(&swhash->hlist_mutex);
13045 }
13046
13047 #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
13048 static void __perf_event_exit_context(void *__info)
13049 {
13050         struct perf_event_context *ctx = __info;
13051         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
13052         struct perf_event *event;
13053
13054         raw_spin_lock(&ctx->lock);
13055         ctx_sched_out(ctx, cpuctx, EVENT_TIME);
13056         list_for_each_entry(event, &ctx->event_list, event_entry)
13057                 __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
13058         raw_spin_unlock(&ctx->lock);
13059 }
13060
13061 static void perf_event_exit_cpu_context(int cpu)
13062 {
13063         struct perf_cpu_context *cpuctx;
13064         struct perf_event_context *ctx;
13065         struct pmu *pmu;
13066
13067         mutex_lock(&pmus_lock);
13068         list_for_each_entry(pmu, &pmus, entry) {
13069                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
13070                 ctx = &cpuctx->ctx;
13071
13072                 mutex_lock(&ctx->mutex);
13073                 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
13074                 cpuctx->online = 0;
13075                 mutex_unlock(&ctx->mutex);
13076         }
13077         cpumask_clear_cpu(cpu, perf_online_mask);
13078         mutex_unlock(&pmus_lock);
13079 }
13080 #else
13081
13082 static void perf_event_exit_cpu_context(int cpu) { }
13083
13084 #endif
13085
13086 int perf_event_init_cpu(unsigned int cpu)
13087 {
13088         struct perf_cpu_context *cpuctx;
13089         struct perf_event_context *ctx;
13090         struct pmu *pmu;
13091
13092         perf_swevent_init_cpu(cpu);
13093
13094         mutex_lock(&pmus_lock);
13095         cpumask_set_cpu(cpu, perf_online_mask);
13096         list_for_each_entry(pmu, &pmus, entry) {
13097                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
13098                 ctx = &cpuctx->ctx;
13099
13100                 mutex_lock(&ctx->mutex);
13101                 cpuctx->online = 1;
13102                 mutex_unlock(&ctx->mutex);
13103         }
13104         mutex_unlock(&pmus_lock);
13105
13106         return 0;
13107 }
13108
13109 int perf_event_exit_cpu(unsigned int cpu)
13110 {
13111         perf_event_exit_cpu_context(cpu);
13112         return 0;
13113 }
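/*
 * These two are wired up as CPU hotplug callbacks from kernel/cpu.c,
 * approximately:
 *
 *	[CPUHP_AP_PERF_ONLINE] = {
 *		.name			= "perf:online",
 *		.startup.single		= perf_event_init_cpu,
 *		.teardown.single	= perf_event_exit_cpu,
 *	},
 *
 * so perf_online_mask tracks the CPUs on which events may be created.
 */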
13114
13115 static int
13116 perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
13117 {
13118         int cpu;
13119
13120         for_each_online_cpu(cpu)
13121                 perf_event_exit_cpu(cpu);
13122
13123         return NOTIFY_OK;
13124 }
13125
13126 /*
13127  * Run the perf reboot notifier at the very last possible moment so that
13128  * the generic watchdog code runs as long as possible.
13129  */
13130 static struct notifier_block perf_reboot_notifier = {
13131         .notifier_call = perf_reboot,
13132         .priority = INT_MIN,
13133 };
13134
13135 void __init perf_event_init(void)
13136 {
13137         int ret;
13138
13139         idr_init(&pmu_idr);
13140
13141         perf_event_init_all_cpus();
13142         init_srcu_struct(&pmus_srcu);
13143         perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
13144         perf_pmu_register(&perf_cpu_clock, NULL, -1);
13145         perf_pmu_register(&perf_task_clock, NULL, -1);
13146         perf_tp_register();
13147         perf_event_init_cpu(smp_processor_id());
13148         register_reboot_notifier(&perf_reboot_notifier);
13149
13150         ret = init_hw_breakpoint();
13151         WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
13152
13153         perf_event_cache = KMEM_CACHE(perf_event, SLAB_PANIC);
13154
13155         /*
13156          * Build time assertion that we keep the data_head at the intended
13157          * location.  IOW, validation that we got the __reserved[] size right.
13158          */
13159         BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
13160                      != 1024);
13161 }
13162
13163 ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
13164                               char *page)
13165 {
13166         struct perf_pmu_events_attr *pmu_attr =
13167                 container_of(attr, struct perf_pmu_events_attr, attr);
13168
13169         if (pmu_attr->event_str)
13170                 return sprintf(page, "%s\n", pmu_attr->event_str);
13171
13172         return 0;
13173 }
13174 EXPORT_SYMBOL_GPL(perf_event_sysfs_show);
13175
13176 static int __init perf_event_sysfs_init(void)
13177 {
13178         struct pmu *pmu;
13179         int ret;
13180
13181         mutex_lock(&pmus_lock);
13182
13183         ret = bus_register(&pmu_bus);
13184         if (ret)
13185                 goto unlock;
13186
13187         list_for_each_entry(pmu, &pmus, entry) {
13188                 if (!pmu->name || pmu->type < 0)
13189                         continue;
13190
13191                 ret = pmu_dev_alloc(pmu);
13192                 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
13193         }
13194         pmu_bus_running = 1;
13195         ret = 0;
13196
13197 unlock:
13198         mutex_unlock(&pmus_lock);
13199
13200         return ret;
13201 }
13202 device_initcall(perf_event_sysfs_init);
13203
13204 #ifdef CONFIG_CGROUP_PERF
13205 static struct cgroup_subsys_state *
13206 perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
13207 {
13208         struct perf_cgroup *jc;
13209
13210         jc = kzalloc(sizeof(*jc), GFP_KERNEL);
13211         if (!jc)
13212                 return ERR_PTR(-ENOMEM);
13213
13214         jc->info = alloc_percpu(struct perf_cgroup_info);
13215         if (!jc->info) {
13216                 kfree(jc);
13217                 return ERR_PTR(-ENOMEM);
13218         }
13219
13220         return &jc->css;
13221 }
13222
13223 static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
13224 {
13225         struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
13226
13227         free_percpu(jc->info);
13228         kfree(jc);
13229 }
13230
13231 static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
13232 {
13233         perf_event_cgroup(css->cgroup);
13234         return 0;
13235 }
13236
13237 static int __perf_cgroup_move(void *info)
13238 {
13239         struct task_struct *task = info;
13240         rcu_read_lock();
13241         perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
13242         rcu_read_unlock();
13243         return 0;
13244 }
13245
13246 static void perf_cgroup_attach(struct cgroup_taskset *tset)
13247 {
13248         struct task_struct *task;
13249         struct cgroup_subsys_state *css;
13250
13251         cgroup_taskset_for_each(task, css, tset)
13252                 task_function_call(task, __perf_cgroup_move, task);
13253 }
13254
13255 struct cgroup_subsys perf_event_cgrp_subsys = {
13256         .css_alloc      = perf_cgroup_css_alloc,
13257         .css_free       = perf_cgroup_css_free,
13258         .css_online     = perf_cgroup_css_online,
13259         .attach         = perf_cgroup_attach,
13260         /*
13261          * Implicitly enable on dfl hierarchy so that perf events can
13262          * always be filtered by cgroup2 path as long as perf_event
13263          * controller is not mounted on a legacy hierarchy.
13264          */
13265         .implicit_on_dfl = true,
13266         .threaded       = true,
13267 };
13268 #endif /* CONFIG_CGROUP_PERF */