tools: Free BTF objects at various locations
[linux-2.6-microblaze.git] tools/perf/util/bpf_counter.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 /* Copyright (c) 2019 Facebook */
4
5 #include <assert.h>
6 #include <limits.h>
7 #include <unistd.h>
8 #include <sys/file.h>
9 #include <sys/time.h>
10 #include <linux/err.h>
11 #include <linux/zalloc.h>
12 #include <api/fs/fs.h>
13 #include <perf/bpf_perf.h>
14
15 #include "bpf_counter.h"
16 #include "counts.h"
17 #include "debug.h"
18 #include "evsel.h"
19 #include "evlist.h"
20 #include "target.h"
21 #include "cgroup.h"
22 #include "cpumap.h"
23 #include "thread_map.h"
24
25 #include "bpf_skel/bpf_prog_profiler.skel.h"
26 #include "bpf_skel/bperf_u.h"
27 #include "bpf_skel/bperf_leader.skel.h"
28 #include "bpf_skel/bperf_follower.skel.h"
29
30 #define ATTR_MAP_SIZE 16
31
32 static inline void *u64_to_ptr(__u64 ptr)
33 {
34         return (void *)(unsigned long)ptr;
35 }
36
37 static struct bpf_counter *bpf_counter_alloc(void)
38 {
39         struct bpf_counter *counter;
40
41         counter = zalloc(sizeof(*counter));
42         if (counter)
43                 INIT_LIST_HEAD(&counter->list);
44         return counter;
45 }
46
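/*
 * Tear down every per-target-program profiler skeleton attached to this
 * evsel and free the bookkeeping structs on bpf_counter_list.
 */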
47 static int bpf_program_profiler__destroy(struct evsel *evsel)
48 {
49         struct bpf_counter *counter, *tmp;
50
51         list_for_each_entry_safe(counter, tmp,
52                                  &evsel->bpf_counter_list, list) {
53                 list_del_init(&counter->list);
54                 bpf_prog_profiler_bpf__destroy(counter->skel);
55                 free(counter);
56         }
57         assert(list_empty(&evsel->bpf_counter_list));
58
59         return 0;
60 }
61
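/*
 * Recover the name of the target BPF program's entry function from its
 * BTF func_info. The name is needed by bpf_program__set_attach_target()
 * when pointing the fentry/fexit profiler programs at the target.
 */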
62 static char *bpf_target_prog_name(int tgt_fd)
63 {
64         struct bpf_prog_info_linear *info_linear;
65         struct bpf_func_info *func_info;
66         const struct btf_type *t;
67         struct btf *btf = NULL;
68         char *name = NULL;
69
70         info_linear = bpf_program__get_prog_info_linear(
71                 tgt_fd, 1UL << BPF_PROG_INFO_FUNC_INFO);
72         if (IS_ERR_OR_NULL(info_linear)) {
73                 pr_debug("failed to get info_linear for prog FD %d\n", tgt_fd);
74                 return NULL;
75         }
76
77         if (info_linear->info.btf_id == 0 ||
78             btf__get_from_id(info_linear->info.btf_id, &btf)) {
79                 pr_debug("prog FD %d doesn't have valid btf\n", tgt_fd);
80                 goto out;
81         }
82
83         func_info = u64_to_ptr(info_linear->info.func_info);
84         t = btf__type_by_id(btf, func_info[0].type_id);
85         if (!t) {
86                 pr_debug("btf %d doesn't have type %d\n",
87                          info_linear->info.btf_id, func_info[0].type_id);
88                 goto out;
89         }
90         name = strdup(btf__name_by_offset(btf, t->name_off));
91 out:
92         btf__free(btf);
93         free(info_linear);
94         return name;
95 }
96
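/*
 * Load one profiler skeleton for the BPF program identified by prog_id:
 * size the per-cpu maps, point the fentry/fexit programs at the target
 * program's entry function, load the skeleton, and queue the resulting
 * counter on evsel->bpf_counter_list.
 */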
97 static int bpf_program_profiler_load_one(struct evsel *evsel, u32 prog_id)
98 {
99         struct bpf_prog_profiler_bpf *skel;
100         struct bpf_counter *counter;
101         struct bpf_program *prog;
102         char *prog_name;
103         int prog_fd;
104         int err;
105
106         prog_fd = bpf_prog_get_fd_by_id(prog_id);
107         if (prog_fd < 0) {
108                 pr_err("Failed to open fd for bpf prog %u\n", prog_id);
109                 return -1;
110         }
111         counter = bpf_counter_alloc();
112         if (!counter) {
113                 close(prog_fd);
114                 return -1;
115         }
116
117         skel = bpf_prog_profiler_bpf__open();
118         if (!skel) {
119                 pr_err("Failed to open bpf skeleton\n");
120                 goto err_out;
121         }
122
123         skel->rodata->num_cpu = evsel__nr_cpus(evsel);
124
125         bpf_map__resize(skel->maps.events, evsel__nr_cpus(evsel));
126         bpf_map__resize(skel->maps.fentry_readings, 1);
127         bpf_map__resize(skel->maps.accum_readings, 1);
128
129         prog_name = bpf_target_prog_name(prog_fd);
130         if (!prog_name) {
131                 pr_err("Failed to get program name for bpf prog %u. Does it have BTF?\n", prog_id);
132                 goto err_out;
133         }
134
135         bpf_object__for_each_program(prog, skel->obj) {
136                 err = bpf_program__set_attach_target(prog, prog_fd, prog_name);
137                 if (err) {
138                         pr_err("bpf_program__set_attach_target failed.\n"
139                                "Does bpf prog %u have BTF?\n", prog_id);
140                         goto err_out;
141                 }
142         }
143         set_max_rlimit();
144         err = bpf_prog_profiler_bpf__load(skel);
145         if (err) {
146                 pr_err("bpf_prog_profiler_bpf__load failed\n");
147                 goto err_out;
148         }
149
150         assert(skel != NULL);
151         counter->skel = skel;
152         list_add(&counter->list, &evsel->bpf_counter_list);
153         close(prog_fd);
154         return 0;
155 err_out:
156         bpf_prog_profiler_bpf__destroy(skel);
157         free(counter);
158         close(prog_fd);
159         return -1;
160 }
161
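/*
 * target->bpf_str is a comma separated list of BPF program ids (from the
 * -b/--bpf-prog option of perf-stat). Load one profiler skeleton per id.
 */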
162 static int bpf_program_profiler__load(struct evsel *evsel, struct target *target)
163 {
164         char *bpf_str, *bpf_str_, *tok, *saveptr = NULL, *p;
165         u32 prog_id;
166         int ret;
167
168         bpf_str_ = bpf_str = strdup(target->bpf_str);
169         if (!bpf_str)
170                 return -1;
171
172         while ((tok = strtok_r(bpf_str, ",", &saveptr)) != NULL) {
173                 prog_id = strtoul(tok, &p, 10);
174                 if (prog_id == 0 || prog_id == UINT_MAX ||
175                     (*p != '\0' && *p != ',')) {
176                         pr_err("Failed to parse bpf prog ids %s\n",
177                                target->bpf_str);
                            free(bpf_str_);
178                         return -1;
179                 }
180
181                 ret = bpf_program_profiler_load_one(evsel, prog_id);
182                 if (ret) {
183                         bpf_program_profiler__destroy(evsel);
184                         free(bpf_str_);
185                         return -1;
186                 }
187                 bpf_str = NULL;
188         }
189         free(bpf_str_);
190         return 0;
191 }
192
193 static int bpf_program_profiler__enable(struct evsel *evsel)
194 {
195         struct bpf_counter *counter;
196         int ret;
197
198         list_for_each_entry(counter, &evsel->bpf_counter_list, list) {
199                 assert(counter->skel != NULL);
200                 ret = bpf_prog_profiler_bpf__attach(counter->skel);
201                 if (ret) {
202                         bpf_program_profiler__destroy(evsel);
203                         return ret;
204                 }
205         }
206         return 0;
207 }
208
209 static int bpf_program_profiler__disable(struct evsel *evsel)
210 {
211         struct bpf_counter *counter;
212
213         list_for_each_entry(counter, &evsel->bpf_counter_list, list) {
214                 assert(counter->skel != NULL);
215                 bpf_prog_profiler_bpf__detach(counter->skel);
216         }
217         return 0;
218 }
219
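/*
 * Sum the per-cpu accum_readings of every profiled BPF program into the
 * evsel's counts. The counts are zeroed first so the totals cover all
 * programs on bpf_counter_list.
 */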
220 static int bpf_program_profiler__read(struct evsel *evsel)
221 {
222         // perf_cpu_map uses /sys/devices/system/cpu/online
223         int num_cpu = evsel__nr_cpus(evsel);
224         // BPF_MAP_TYPE_PERCPU_ARRAY uses /sys/devices/system/cpu/possible
225         // Sometimes possible > online, e.g. a Ryzen 3900X that has 24
226         // threads but whose "possible" mask showed 0-31 -acme
227         int num_cpu_bpf = libbpf_num_possible_cpus();
228         struct bpf_perf_event_value values[num_cpu_bpf];
229         struct bpf_counter *counter;
230         int reading_map_fd;
231         __u32 key = 0;
232         int err, cpu;
233
234         if (list_empty(&evsel->bpf_counter_list))
235                 return -EAGAIN;
236
237         for (cpu = 0; cpu < num_cpu; cpu++) {
238                 perf_counts(evsel->counts, cpu, 0)->val = 0;
239                 perf_counts(evsel->counts, cpu, 0)->ena = 0;
240                 perf_counts(evsel->counts, cpu, 0)->run = 0;
241         }
242         list_for_each_entry(counter, &evsel->bpf_counter_list, list) {
243                 struct bpf_prog_profiler_bpf *skel = counter->skel;
244
245                 assert(skel != NULL);
246                 reading_map_fd = bpf_map__fd(skel->maps.accum_readings);
247
248                 err = bpf_map_lookup_elem(reading_map_fd, &key, values);
249                 if (err) {
250                         pr_err("failed to read value\n");
251                         return err;
252                 }
253
254                 for (cpu = 0; cpu < num_cpu; cpu++) {
255                         perf_counts(evsel->counts, cpu, 0)->val += values[cpu].counter;
256                         perf_counts(evsel->counts, cpu, 0)->ena += values[cpu].enabled;
257                         perf_counts(evsel->counts, cpu, 0)->run += values[cpu].running;
258                 }
259         }
260         return 0;
261 }
262
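/*
 * Store the perf_event file descriptor for this cpu into the events map
 * of every profiler skeleton, so the profiler BPF programs can read that
 * event.
 */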
263 static int bpf_program_profiler__install_pe(struct evsel *evsel, int cpu,
264                                             int fd)
265 {
266         struct bpf_prog_profiler_bpf *skel;
267         struct bpf_counter *counter;
268         int ret;
269
270         list_for_each_entry(counter, &evsel->bpf_counter_list, list) {
271                 skel = counter->skel;
272                 assert(skel != NULL);
273
274                 ret = bpf_map_update_elem(bpf_map__fd(skel->maps.events),
275                                           &cpu, &fd, BPF_ANY);
276                 if (ret)
277                         return ret;
278         }
279         return 0;
280 }
281
282 struct bpf_counter_ops bpf_program_profiler_ops = {
283         .load       = bpf_program_profiler__load,
284         .enable     = bpf_program_profiler__enable,
285         .disable    = bpf_program_profiler__disable,
286         .read       = bpf_program_profiler__read,
287         .destroy    = bpf_program_profiler__destroy,
288         .install_pe = bpf_program_profiler__install_pe,
289 };
290
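/*
 * Sanity check a pinned attr map before reusing it: its key and value
 * sizes must match the layout this perf binary expects.
 */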
291 static bool bperf_attr_map_compatible(int attr_map_fd)
292 {
293         struct bpf_map_info map_info = {0};
294         __u32 map_info_len = sizeof(map_info);
295         int err;
296
297         err = bpf_obj_get_info_by_fd(attr_map_fd, &map_info, &map_info_len);
298
299         if (err)
300                 return false;
301         return (map_info.key_size == sizeof(struct perf_event_attr)) &&
302                 (map_info.value_size == sizeof(struct perf_event_attr_map_entry));
303 }
304
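/*
 * Open the system wide perf_event_attr -> perf_event_attr_map_entry map,
 * either at the path given by target->attr_map or, by default, under the
 * bpffs mount in sysfs. Create and pin the map if it does not exist yet,
 * then take an exclusive flock() on it so that concurrent perf sessions
 * serialize leader setup.
 */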
305 static int bperf_lock_attr_map(struct target *target)
306 {
307         char path[PATH_MAX];
308         int map_fd, err;
309
310         if (target->attr_map) {
311                 scnprintf(path, PATH_MAX, "%s", target->attr_map);
312         } else {
313                 scnprintf(path, PATH_MAX, "%s/fs/bpf/%s", sysfs__mountpoint(),
314                           BPF_PERF_DEFAULT_ATTR_MAP_PATH);
315         }
316
317         if (access(path, F_OK)) {
318                 map_fd = bpf_create_map(BPF_MAP_TYPE_HASH,
319                                         sizeof(struct perf_event_attr),
320                                         sizeof(struct perf_event_attr_map_entry),
321                                         ATTR_MAP_SIZE, 0);
322                 if (map_fd < 0)
323                         return -1;
324
325                 err = bpf_obj_pin(map_fd, path);
326                 if (err) {
327                         /* someone pinned the map in parallel? */
328                         close(map_fd);
329                         map_fd = bpf_obj_get(path);
330                         if (map_fd < 0)
331                                 return -1;
332                 }
333         } else {
334                 map_fd = bpf_obj_get(path);
335                 if (map_fd < 0)
336                         return -1;
337         }
338
339         if (!bperf_attr_map_compatible(map_fd)) {
340                 close(map_fd);
341                 return -1;
343         }
344         err = flock(map_fd, LOCK_EX);
345         if (err) {
346                 close(map_fd);
347                 return -1;
348         }
349         return map_fd;
350 }
351
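/*
 * Derive the follower filter type (global, per cpu, per tid or per tgid)
 * and the number of filter entries from the requested target. Event
 * groups are not supported by bperf yet.
 */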
352 static int bperf_check_target(struct evsel *evsel,
353                               struct target *target,
354                               enum bperf_filter_type *filter_type,
355                               __u32 *filter_entry_cnt)
356 {
357         if (evsel->core.leader->nr_members > 1) {
358                 pr_err("bpf managed perf events do not yet support groups.\n");
359                 return -1;
360         }
361
362         /* determine filter type based on target */
363         if (target->system_wide) {
364                 *filter_type = BPERF_FILTER_GLOBAL;
365                 *filter_entry_cnt = 1;
366         } else if (target->cpu_list) {
367                 *filter_type = BPERF_FILTER_CPU;
368                 *filter_entry_cnt = perf_cpu_map__nr(evsel__cpus(evsel));
369         } else if (target->tid) {
370                 *filter_type = BPERF_FILTER_PID;
371                 *filter_entry_cnt = perf_thread_map__nr(evsel->core.threads);
372         } else if (target->pid || evsel->evlist->workload.pid != -1) {
373                 *filter_type = BPERF_FILTER_TGID;
374                 *filter_entry_cnt = perf_thread_map__nr(evsel->core.threads);
375         } else {
376                 pr_err("bpf managed perf events do not yet support these targets.\n");
377                 return -1;
378         }
379
380         return 0;
381 }
382
383 static struct perf_cpu_map *all_cpu_map;
384
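/*
 * (Re)create the shared leader: load the leader skeleton, attach its
 * on_switch program to the sched_switch tracepoint, and publish the new
 * link id and diff map id in the attr map so other sessions can reuse
 * them. The perf_events themselves are opened on all cpus via
 * evsel__open_per_cpu(), which installs them through install_pe(). The
 * kernel objects stay alive through the link fd held by the evsel.
 */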
385 static int bperf_reload_leader_program(struct evsel *evsel, int attr_map_fd,
386                                        struct perf_event_attr_map_entry *entry)
387 {
388         struct bperf_leader_bpf *skel = bperf_leader_bpf__open();
389         int link_fd, diff_map_fd, err;
390         struct bpf_link *link = NULL;
391
392         if (!skel) {
393                 pr_err("Failed to open leader skeleton\n");
394                 return -1;
395         }
396
397         bpf_map__resize(skel->maps.events, libbpf_num_possible_cpus());
398         err = bperf_leader_bpf__load(skel);
399         if (err) {
400                 pr_err("Failed to load leader skeleton\n");
401                 goto out;
402         }
403
404         link = bpf_program__attach(skel->progs.on_switch);
405         if (IS_ERR(link)) {
406                 pr_err("Failed to attach leader program\n");
407                 err = PTR_ERR(link);
408                 goto out;
409         }
410
411         link_fd = bpf_link__fd(link);
412         diff_map_fd = bpf_map__fd(skel->maps.diff_readings);
413         entry->link_id = bpf_link_get_id(link_fd);
414         entry->diff_map_id = bpf_map_get_id(diff_map_fd);
415         err = bpf_map_update_elem(attr_map_fd, &evsel->core.attr, entry, BPF_ANY);
416         assert(err == 0);
417
418         evsel->bperf_leader_link_fd = bpf_link_get_fd_by_id(entry->link_id);
419         assert(evsel->bperf_leader_link_fd >= 0);
420
421         /*
422          * save leader_skel for install_pe, which is called within the
423          * following evsel__open_per_cpu() call
424          */
425         evsel->leader_skel = skel;
426         evsel__open_per_cpu(evsel, all_cpu_map, -1);
427
428 out:
429         bperf_leader_bpf__destroy(skel);
430         bpf_link__destroy(link);
431         return err;
432 }
433
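/*
 * bperf setup for one evsel:
 *   1. look up (or create) the shared leader program for this
 *      perf_event_attr under the flock'ed attr map, and hold fds on the
 *      leader link and program;
 *   2. open and load a private follower skeleton, attach it as fexit on
 *      the leader's on_switch program, reuse the leader's diff_readings
 *      map, and populate the filter map according to the target.
 */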
434 static int bperf__load(struct evsel *evsel, struct target *target)
435 {
436         struct perf_event_attr_map_entry entry = {0xffffffff, 0xffffffff};
437         int attr_map_fd, diff_map_fd = -1, err;
438         enum bperf_filter_type filter_type;
439         __u32 filter_entry_cnt, i;
440
441         if (bperf_check_target(evsel, target, &filter_type, &filter_entry_cnt))
442                 return -1;
443
444         if (!all_cpu_map) {
445                 all_cpu_map = perf_cpu_map__new(NULL);
446                 if (!all_cpu_map)
447                         return -1;
448         }
449
450         evsel->bperf_leader_prog_fd = -1;
451         evsel->bperf_leader_link_fd = -1;
452
453         /*
454          * Step 1: hold a fd on the leader program and the bpf_link if the
455          * program is not already gone; otherwise, reload the program.
456          * Use flock() to ensure exclusive access to the perf_event_attr
457          * map.
458          */
459         attr_map_fd = bperf_lock_attr_map(target);
460         if (attr_map_fd < 0) {
461                 pr_err("Failed to lock perf_event_attr map\n");
462                 return -1;
463         }
464
465         err = bpf_map_lookup_elem(attr_map_fd, &evsel->core.attr, &entry);
466         if (err) {
467                 err = bpf_map_update_elem(attr_map_fd, &evsel->core.attr, &entry, BPF_ANY);
468                 if (err)
469                         goto out;
470         }
471
472         evsel->bperf_leader_link_fd = bpf_link_get_fd_by_id(entry.link_id);
473         if (evsel->bperf_leader_link_fd < 0 &&
474             bperf_reload_leader_program(evsel, attr_map_fd, &entry)) {
475                 err = -1;
476                 goto out;
477         }
478         /*
479          * The bpf_link holds a reference to the leader program, and the
480          * leader program holds references to the maps. Therefore, if
481          * link_id is valid, diff_map_id should also be valid.
482          */
483         evsel->bperf_leader_prog_fd = bpf_prog_get_fd_by_id(
484                 bpf_link_get_prog_id(evsel->bperf_leader_link_fd));
485         assert(evsel->bperf_leader_prog_fd >= 0);
486
487         diff_map_fd = bpf_map_get_fd_by_id(entry.diff_map_id);
488         assert(diff_map_fd >= 0);
489
490         /*
491          * bperf uses BPF_PROG_TEST_RUN to get accurate readings. Check
492          * whether the kernel supports it.
493          */
494         err = bperf_trigger_reading(evsel->bperf_leader_prog_fd, 0);
495         if (err) {
496                 pr_err("The kernel does not support test_run for raw_tp BPF programs.\n"
497                        "Therefore, --bpf-counters might show inaccurate readings\n");
498                 goto out;
499         }
500
501         /* Step 2: load the follower skeleton */
502         evsel->follower_skel = bperf_follower_bpf__open();
503         if (!evsel->follower_skel) {
504                 err = -1;
505                 pr_err("Failed to open follower skeleton\n");
506                 goto out;
507         }
508
509         /* attach fexit program to the leader program */
510         bpf_program__set_attach_target(evsel->follower_skel->progs.fexit_XXX,
511                                        evsel->bperf_leader_prog_fd, "on_switch");
512
513         /* connect to the leader's diff_readings map */
514         bpf_map__reuse_fd(evsel->follower_skel->maps.diff_readings, diff_map_fd);
515
516         /* set up reading map */
517         bpf_map__set_max_entries(evsel->follower_skel->maps.accum_readings,
518                                  filter_entry_cnt);
519         /* set up follower filter based on target */
520         bpf_map__set_max_entries(evsel->follower_skel->maps.filter,
521                                  filter_entry_cnt);
522         err = bperf_follower_bpf__load(evsel->follower_skel);
523         if (err) {
524                 pr_err("Failed to load follower skeleton\n");
525                 bperf_follower_bpf__destroy(evsel->follower_skel);
526                 evsel->follower_skel = NULL;
527                 goto out;
528         }
529
530         for (i = 0; i < filter_entry_cnt; i++) {
531                 int filter_map_fd;
532                 __u32 key;
533
534                 if (filter_type == BPERF_FILTER_PID ||
535                     filter_type == BPERF_FILTER_TGID)
536                         key = evsel->core.threads->map[i].pid;
537                 else if (filter_type == BPERF_FILTER_CPU)
538                         key = evsel->core.cpus->map[i];
539                 else
540                         break;
541
542                 filter_map_fd = bpf_map__fd(evsel->follower_skel->maps.filter);
543                 bpf_map_update_elem(filter_map_fd, &key, &i, BPF_ANY);
544         }
545
546         evsel->follower_skel->bss->type = filter_type;
547
548         err = bperf_follower_bpf__attach(evsel->follower_skel);
549
550 out:
551         if (err && evsel->bperf_leader_link_fd >= 0)
552                 close(evsel->bperf_leader_link_fd);
553         if (err && evsel->bperf_leader_prog_fd >= 0)
554                 close(evsel->bperf_leader_prog_fd);
555         if (diff_map_fd >= 0)
556                 close(diff_map_fd);
557
558         flock(attr_map_fd, LOCK_UN);
559         close(attr_map_fd);
560
561         return err;
562 }
563
564 static int bperf__install_pe(struct evsel *evsel, int cpu, int fd)
565 {
566         struct bperf_leader_bpf *skel = evsel->leader_skel;
567
568         return bpf_map_update_elem(bpf_map__fd(skel->maps.events),
569                                    &cpu, &fd, BPF_ANY);
570 }
571
572 /*
573  * Trigger the leader prog on each cpu so that the accum_readings map
574  * gets the latest readings.
575  */
576 static int bperf_sync_counters(struct evsel *evsel)
577 {
578         int num_cpu, i, cpu;
579
580         num_cpu = all_cpu_map->nr;
581         for (i = 0; i < num_cpu; i++) {
582                 cpu = all_cpu_map->map[i];
583                 bperf_trigger_reading(evsel->bperf_leader_prog_fd, cpu);
584         }
585         return 0;
586 }
587
588 static int bperf__enable(struct evsel *evsel)
589 {
590         evsel->follower_skel->bss->enabled = 1;
591         return 0;
592 }
593
594 static int bperf__disable(struct evsel *evsel)
595 {
596         evsel->follower_skel->bss->enabled = 0;
597         return 0;
598 }
599
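/*
 * Flush the counters via bperf_sync_counters(), then copy accum_readings
 * into evsel->counts. The indexing depends on the filter type: global and
 * per-cpu readings are stored per cpu, while per-task readings are summed
 * across cpus into one entry per thread.
 */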
600 static int bperf__read(struct evsel *evsel)
601 {
602         struct bperf_follower_bpf *skel = evsel->follower_skel;
603         __u32 num_cpu_bpf = cpu__max_cpu();
604         struct bpf_perf_event_value values[num_cpu_bpf];
605         int reading_map_fd, err = 0;
606         __u32 i, j, num_cpu;
607
608         bperf_sync_counters(evsel);
609         reading_map_fd = bpf_map__fd(skel->maps.accum_readings);
610
611         for (i = 0; i < bpf_map__max_entries(skel->maps.accum_readings); i++) {
612                 __u32 cpu;
613
614                 err = bpf_map_lookup_elem(reading_map_fd, &i, values);
615                 if (err)
616                         goto out;
617                 switch (evsel->follower_skel->bss->type) {
618                 case BPERF_FILTER_GLOBAL:
619                         assert(i == 0);
620
621                         num_cpu = all_cpu_map->nr;
622                         for (j = 0; j < num_cpu; j++) {
623                                 cpu = all_cpu_map->map[j];
624                                 perf_counts(evsel->counts, cpu, 0)->val = values[cpu].counter;
625                                 perf_counts(evsel->counts, cpu, 0)->ena = values[cpu].enabled;
626                                 perf_counts(evsel->counts, cpu, 0)->run = values[cpu].running;
627                         }
628                         break;
629                 case BPERF_FILTER_CPU:
630                         cpu = evsel->core.cpus->map[i];
631                         perf_counts(evsel->counts, i, 0)->val = values[cpu].counter;
632                         perf_counts(evsel->counts, i, 0)->ena = values[cpu].enabled;
633                         perf_counts(evsel->counts, i, 0)->run = values[cpu].running;
634                         break;
635                 case BPERF_FILTER_PID:
636                 case BPERF_FILTER_TGID:
637                         perf_counts(evsel->counts, 0, i)->val = 0;
638                         perf_counts(evsel->counts, 0, i)->ena = 0;
639                         perf_counts(evsel->counts, 0, i)->run = 0;
640
641                         for (cpu = 0; cpu < num_cpu_bpf; cpu++) {
642                                 perf_counts(evsel->counts, 0, i)->val += values[cpu].counter;
643                                 perf_counts(evsel->counts, 0, i)->ena += values[cpu].enabled;
644                                 perf_counts(evsel->counts, 0, i)->run += values[cpu].running;
645                         }
646                         break;
647                 default:
648                         break;
649                 }
650         }
651 out:
652         return err;
653 }
654
655 static int bperf__destroy(struct evsel *evsel)
656 {
657         bperf_follower_bpf__destroy(evsel->follower_skel);
658         close(evsel->bperf_leader_prog_fd);
659         close(evsel->bperf_leader_link_fd);
660         return 0;
661 }
662
663 /*
664  * bperf: share hardware PMCs with BPF
665  *
666  * perf uses performance monitoring counters (PMCs) to monitor system
667  * performance. PMCs are a limited hardware resource. For example,
668  * Intel CPUs have 3 fixed PMCs and 4 programmable PMCs per cpu.
669  *
670  * Modern data center systems use these PMCs in many different ways:
671  * system level monitoring, (maybe nested) container level monitoring, per
672  * process monitoring, profiling (in sample mode), etc. In some cases,
673  * there are more active perf_events than available hardware PMCs. To allow
674  * all perf_events to have a chance to run, it is necessary to do expensive
675  * time multiplexing of events.
676  *
677  * On the other hand, many monitoring tools count the common metrics
678  * (cycles, instructions). It is a waste to have multiple tools create
679  * multiple perf_events of "cycles" and occupy multiple PMCs.
680  *
681  * bperf tries to reduce such waste by allowing multiple perf_events of
682  * "cycles" or "instructions" (at different scopes) to share PMUs. Instead
683  * of having each perf-stat session read its own perf_events, bperf uses
684  * BPF programs to read the perf_events and aggregate the readings into
685  * BPF maps. Then, the perf-stat sessions read the values from these maps.
686  *
687  *                                ||
688  *       shared progs and maps <- || -> per session progs and maps
689  *                                ||
690  *   ---------------              ||
691  *   | perf_events |              ||
692  *   ---------------       fexit  ||      -----------------
693  *          |             --------||----> | follower prog |
694  *       --------------- /        || ---  -----------------
695  * cs -> | leader prog |/         ||/        |         |
696  *   --> ---------------         /||  --------------  ------------------
697  *  /       |         |         / ||  | filter map |  | accum_readings |
698  * /  ------------  ------------  ||  --------------  ------------------
699  * |  | prev map |  | diff map |  ||                        |
700  * |  ------------  ------------  ||                        |
701  *  \                             ||                        |
702  * = \ ==================================================== | ============
703  *    \                                                    /   user space
704  *     \                                                  /
705  *      \                                                /
706  *    BPF_PROG_TEST_RUN                    BPF_MAP_LOOKUP_ELEM
707  *        \                                            /
708  *         \                                          /
709  *          \------  perf-stat ----------------------/
710  *
711  * The figure above shows the architecture of bperf. Note that the figure
712  * is divided into 3 regions: shared progs and maps (top left), per session
713  * progs and maps (top right), and user space (bottom).
714  *
715  * The leader prog is triggered on each context switch (cs). It reads the
716  * perf_events and stores the difference (current_reading -
717  * previous_reading) into the diff map. For the same metric, e.g. "cycles",
718  * multiple perf-stat sessions share the same leader prog.
719  *
720  * Each perf-stat session creates a follower prog as an fexit program on
721  * the leader prog. It is possible to attach up to BPF_MAX_TRAMP_PROGS (38)
722  * follower progs to the same leader prog. The follower prog checks the
723  * current task and CPU to decide whether to add the value from the diff
724  * map to its accumulated readings map (accum_readings).
725  *
726  * Finally, perf-stat user space reads the value from the accum_readings map.
727  *
728  * Besides context switches, it is also necessary to trigger the leader prog
729  * before perf-stat reads the value. Otherwise, the accum_readings map may
730  * not have the latest reading from the perf_events. This is achieved by
731  * triggering the leader prog via sys_bpf(BPF_PROG_TEST_RUN) on each CPU.
732  *
733  * Comment before the definition of struct perf_event_attr_map_entry
734  * describes how different sessions of perf-stat share information about
735  * the leader prog.
736  */
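
/*
 * Usage sketch (not taken from this file; see the perf-stat documentation
 * for the exact option names; <pid>, <prog_id> and <path> are placeholders):
 *
 *   # two sessions sharing one "cycles" leader via bperf
 *   perf stat --bpf-counters -e cycles -a -- sleep 10 &
 *   perf stat --bpf-counters -e cycles -p <pid> -- sleep 10
 *
 *   # profile an already loaded BPF program by id (program profiler)
 *   perf stat -b <prog_id> -e cycles -- sleep 10
 *
 *   # use a non-default pinned attr map for bperf
 *   perf stat --bpf-counters --bpf-attr-map <path> -e cycles -a -- sleep 10
 */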
737
738 struct bpf_counter_ops bperf_ops = {
739         .load       = bperf__load,
740         .enable     = bperf__enable,
741         .disable    = bperf__disable,
742         .read       = bperf__read,
743         .install_pe = bperf__install_pe,
744         .destroy    = bperf__destroy,
745 };
746
747 extern struct bpf_counter_ops bperf_cgrp_ops;
748
749 static inline bool bpf_counter_skip(struct evsel *evsel)
750 {
751         return list_empty(&evsel->bpf_counter_list) &&
752                 evsel->follower_skel == NULL;
753 }
754
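/*
 * Generic entry points used by the rest of perf. bpf_counter__load() picks
 * an ops table based on the target (program profiler, bperf, or cgroup
 * bperf); the other wrappers dispatch through it and are no-ops for evsels
 * that have no BPF counter attached. The call order is roughly:
 * load -> install_pe (while the perf_events are opened) -> enable ->
 * read -> disable -> destroy.
 */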
755 int bpf_counter__install_pe(struct evsel *evsel, int cpu, int fd)
756 {
757         if (bpf_counter_skip(evsel))
758                 return 0;
759         return evsel->bpf_counter_ops->install_pe(evsel, cpu, fd);
760 }
761
762 int bpf_counter__load(struct evsel *evsel, struct target *target)
763 {
764         if (target->bpf_str)
765                 evsel->bpf_counter_ops = &bpf_program_profiler_ops;
766         else if (cgrp_event_expanded && target->use_bpf)
767                 evsel->bpf_counter_ops = &bperf_cgrp_ops;
768         else if (target->use_bpf || evsel->bpf_counter ||
769                  evsel__match_bpf_counter_events(evsel->name))
770                 evsel->bpf_counter_ops = &bperf_ops;
771
772         if (evsel->bpf_counter_ops)
773                 return evsel->bpf_counter_ops->load(evsel, target);
774         return 0;
775 }
776
777 int bpf_counter__enable(struct evsel *evsel)
778 {
779         if (bpf_counter_skip(evsel))
780                 return 0;
781         return evsel->bpf_counter_ops->enable(evsel);
782 }
783
784 int bpf_counter__disable(struct evsel *evsel)
785 {
786         if (bpf_counter_skip(evsel))
787                 return 0;
788         return evsel->bpf_counter_ops->disable(evsel);
789 }
790
791 int bpf_counter__read(struct evsel *evsel)
792 {
793         if (bpf_counter_skip(evsel))
794                 return -EAGAIN;
795         return evsel->bpf_counter_ops->read(evsel);
796 }
797
798 void bpf_counter__destroy(struct evsel *evsel)
799 {
800         if (bpf_counter_skip(evsel))
801                 return;
802         evsel->bpf_counter_ops->destroy(evsel);
803         evsel->bpf_counter_ops = NULL;
804 }