perf build: Make BUILD_BPF_SKEL default, rename to NO_BPF_SKEL
tools/perf/builtin-record.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
16
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/symbol.h"
30 #include "util/record.h"
31 #include "util/cpumap.h"
32 #include "util/thread_map.h"
33 #include "util/data.h"
34 #include "util/perf_regs.h"
35 #include "util/auxtrace.h"
36 #include "util/tsc.h"
37 #include "util/parse-branch-options.h"
38 #include "util/parse-regs-options.h"
39 #include "util/perf_api_probe.h"
40 #include "util/llvm-utils.h"
41 #include "util/bpf-loader.h"
42 #include "util/trigger.h"
43 #include "util/perf-hooks.h"
44 #include "util/cpu-set-sched.h"
45 #include "util/synthetic-events.h"
46 #include "util/time-utils.h"
47 #include "util/units.h"
48 #include "util/bpf-event.h"
49 #include "util/util.h"
50 #include "util/pfm.h"
51 #include "util/clockid.h"
52 #include "util/pmu-hybrid.h"
53 #include "util/evlist-hybrid.h"
54 #include "util/off_cpu.h"
55 #include "asm/bug.h"
56 #include "perf.h"
57 #include "cputopo.h"
58
59 #include <errno.h>
60 #include <inttypes.h>
61 #include <locale.h>
62 #include <poll.h>
63 #include <pthread.h>
64 #include <unistd.h>
65 #ifndef HAVE_GETTID
66 #include <syscall.h>
67 #endif
68 #include <sched.h>
69 #include <signal.h>
70 #ifdef HAVE_EVENTFD_SUPPORT
71 #include <sys/eventfd.h>
72 #endif
73 #include <sys/mman.h>
74 #include <sys/wait.h>
75 #include <sys/types.h>
76 #include <sys/stat.h>
77 #include <fcntl.h>
78 #include <linux/err.h>
79 #include <linux/string.h>
80 #include <linux/time64.h>
81 #include <linux/zalloc.h>
82 #include <linux/bitmap.h>
83 #include <sys/time.h>
84
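/*
 * State for --switch-output: whether output rotation is armed, the
 * trigger (signal, output size or time period), and the names of the
 * rotated output files.
 */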
85 struct switch_output {
86         bool             enabled;
87         bool             signal;
88         unsigned long    size;
89         unsigned long    time;
90         const char      *str;
91         bool             set;
92         char             **filenames;
93         int              num_files;
94         int              cur_file;
95 };
96
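/*
 * Per-thread CPU masks: 'maps' selects which mmaps the thread reads,
 * 'affinity' selects which CPUs the thread may be scheduled on.
 */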
97 struct thread_mask {
98         struct mmap_cpu_mask    maps;
99         struct mmap_cpu_mask    affinity;
100 };
101
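/*
 * Per-stream state for parallel trace streaming: control pipes to the
 * main thread, the mmaps and pollfd entries this thread services, and
 * per-thread sample/byte counters.
 */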
102 struct record_thread {
103         pid_t                   tid;
104         struct thread_mask      *mask;
105         struct {
106                 int             msg[2];
107                 int             ack[2];
108         } pipes;
109         struct fdarray          pollfd;
110         int                     ctlfd_pos;
111         int                     nr_mmaps;
112         struct mmap             **maps;
113         struct mmap             **overwrite_maps;
114         struct record           *rec;
115         unsigned long long      samples;
116         unsigned long           waking;
117         u64                     bytes_written;
118         u64                     bytes_transferred;
119         u64                     bytes_compressed;
120 };
121
122 static __thread struct record_thread *thread;
123
124 enum thread_msg {
125         THREAD_MSG__UNDEFINED = 0,
126         THREAD_MSG__READY,
127         THREAD_MSG__MAX,
128 };
129
130 static const char *thread_msg_tags[THREAD_MSG__MAX] = {
131         "UNDEFINED", "READY"
132 };
133
134 enum thread_spec {
135         THREAD_SPEC__UNDEFINED = 0,
136         THREAD_SPEC__CPU,
137         THREAD_SPEC__CORE,
138         THREAD_SPEC__PACKAGE,
139         THREAD_SPEC__NUMA,
140         THREAD_SPEC__USER,
141         THREAD_SPEC__MAX,
142 };
143
144 static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
145         "undefined", "cpu", "core", "package", "numa", "user"
146 };
147
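/*
 * Maps an entry of the evlist pollfd array to its duplicate in the main
 * thread's pollfd array, so revents can be copied back after polling.
 */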
148 struct pollfd_index_map {
149         int evlist_pollfd_index;
150         int thread_pollfd_index;
151 };
152
153 struct record {
154         struct perf_tool        tool;
155         struct record_opts      opts;
156         u64                     bytes_written;
157         u64                     thread_bytes_written;
158         struct perf_data        data;
159         struct auxtrace_record  *itr;
160         struct evlist   *evlist;
161         struct perf_session     *session;
162         struct evlist           *sb_evlist;
163         pthread_t               thread_id;
164         int                     realtime_prio;
165         bool                    switch_output_event_set;
166         bool                    no_buildid;
167         bool                    no_buildid_set;
168         bool                    no_buildid_cache;
169         bool                    no_buildid_cache_set;
170         bool                    buildid_all;
171         bool                    buildid_mmap;
172         bool                    timestamp_filename;
173         bool                    timestamp_boundary;
174         bool                    off_cpu;
175         struct switch_output    switch_output;
176         unsigned long long      samples;
177         unsigned long           output_max_size;        /* = 0: unlimited */
178         struct perf_debuginfod  debuginfod;
179         int                     nr_threads;
180         struct thread_mask      *thread_masks;
181         struct record_thread    *thread_data;
182         struct pollfd_index_map *index_map;
183         size_t                  index_map_sz;
184         size_t                  index_map_cnt;
185 };
186
187 static volatile int done;
188
189 static volatile int auxtrace_record__snapshot_started;
190 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
191 static DEFINE_TRIGGER(switch_output_trigger);
192
193 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
194         "SYS", "NODE", "CPU"
195 };
196
197 #ifndef HAVE_GETTID
198 static inline pid_t gettid(void)
199 {
200         return (pid_t)syscall(__NR_gettid);
201 }
202 #endif
203
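/* Non-zero when parallel trace streaming was requested (threads_spec is set). */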
204 static int record__threads_enabled(struct record *rec)
205 {
206         return rec->opts.threads_spec;
207 }
208
209 static bool switch_output_signal(struct record *rec)
210 {
211         return rec->switch_output.signal &&
212                trigger_is_ready(&switch_output_trigger);
213 }
214
215 static bool switch_output_size(struct record *rec)
216 {
217         return rec->switch_output.size &&
218                trigger_is_ready(&switch_output_trigger) &&
219                (rec->bytes_written >= rec->switch_output.size);
220 }
221
222 static bool switch_output_time(struct record *rec)
223 {
224         return rec->switch_output.time &&
225                trigger_is_ready(&switch_output_trigger);
226 }
227
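/* Total bytes written so far by the main thread plus all worker threads. */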
228 static u64 record__bytes_written(struct record *rec)
229 {
230         return rec->bytes_written + rec->thread_bytes_written;
231 }
232
233 static bool record__output_max_size_exceeded(struct record *rec)
234 {
235         return rec->output_max_size &&
236                (record__bytes_written(rec) >= rec->output_max_size);
237 }
238
239 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
240                          void *bf, size_t size)
241 {
242         struct perf_data_file *file = &rec->session->data->file;
243
244         if (map && map->file)
245                 file = map->file;
246
247         if (perf_data_file__write(file, bf, size) < 0) {
248                 pr_err("failed to write perf data, error: %m\n");
249                 return -1;
250         }
251
252         if (map && map->file) {
253                 thread->bytes_written += size;
254                 rec->thread_bytes_written += size;
255         } else {
256                 rec->bytes_written += size;
257         }
258
259         if (record__output_max_size_exceeded(rec) && !done) {
260                 fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
261                                 " stopping session ]\n",
262                                 record__bytes_written(rec) >> 10);
263                 done = 1;
264         }
265
266         if (switch_output_size(rec))
267                 trigger_hit(&switch_output_trigger);
268
269         return 0;
270 }
271
272 static int record__aio_enabled(struct record *rec);
273 static int record__comp_enabled(struct record *rec);
274 static size_t zstd_compress(struct perf_session *session, struct mmap *map,
275                             void *dst, size_t dst_size, void *src, size_t src_size);
276
277 #ifdef HAVE_AIO_SUPPORT
278 static int record__aio_write(struct aiocb *cblock, int trace_fd,
279                 void *buf, size_t size, off_t off)
280 {
281         int rc;
282
283         cblock->aio_fildes = trace_fd;
284         cblock->aio_buf    = buf;
285         cblock->aio_nbytes = size;
286         cblock->aio_offset = off;
287         cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
288
289         do {
290                 rc = aio_write(cblock);
291                 if (rc == 0) {
292                         break;
293                 } else if (errno != EAGAIN) {
294                         cblock->aio_fildes = -1;
295                         pr_err("failed to queue perf data, error: %m\n");
296                         break;
297                 }
298         } while (1);
299
300         return rc;
301 }
302
303 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
304 {
305         void *rem_buf;
306         off_t rem_off;
307         size_t rem_size;
308         int rc, aio_errno;
309         ssize_t aio_ret, written;
310
311         aio_errno = aio_error(cblock);
312         if (aio_errno == EINPROGRESS)
313                 return 0;
314
315         written = aio_ret = aio_return(cblock);
316         if (aio_ret < 0) {
317                 if (aio_errno != EINTR)
318                         pr_err("failed to write perf data, error: %m\n");
319                 written = 0;
320         }
321
322         rem_size = cblock->aio_nbytes - written;
323
324         if (rem_size == 0) {
325                 cblock->aio_fildes = -1;
326                 /*
327                  * md->refcount is incremented in record__aio_pushfn() for
328                  * every aio write request started in record__aio_push(), so
329                  * decrement it now that the request is complete.
330                  */
331                 perf_mmap__put(&md->core);
332                 rc = 1;
333         } else {
334                 /*
335                  * The aio write request may need to be restarted with the
336                  * remainder if the kernel didn't write the whole
337                  * chunk at once.
338                  */
339                 rem_off = cblock->aio_offset + written;
340                 rem_buf = (void *)(cblock->aio_buf + written);
341                 record__aio_write(cblock, cblock->aio_fildes,
342                                 rem_buf, rem_size, rem_off);
343                 rc = 0;
344         }
345
346         return rc;
347 }
348
349 static int record__aio_sync(struct mmap *md, bool sync_all)
350 {
351         struct aiocb **aiocb = md->aio.aiocb;
352         struct aiocb *cblocks = md->aio.cblocks;
353         struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
354         int i, do_suspend;
355
356         do {
357                 do_suspend = 0;
358                 for (i = 0; i < md->aio.nr_cblocks; ++i) {
359                         if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
360                                 if (sync_all)
361                                         aiocb[i] = NULL;
362                                 else
363                                         return i;
364                         } else {
365                                 /*
366                                  * The started aio write is not complete yet,
367                                  * so it has to be waited on before the
368                                  * next allocation.
369                                  */
370                                 aiocb[i] = &cblocks[i];
371                                 do_suspend = 1;
372                         }
373                 }
374                 if (!do_suspend)
375                         return -1;
376
377                 while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
378                         if (!(errno == EAGAIN || errno == EINTR))
379                                 pr_err("failed to sync perf data, error: %m\n");
380                 }
381         } while (1);
382 }
383
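/*
 * Accumulates ring-buffer data in an aio staging buffer before an
 * asynchronous write is queued for it.
 */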
384 struct record_aio {
385         struct record   *rec;
386         void            *data;
387         size_t          size;
388 };
389
390 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
391 {
392         struct record_aio *aio = to;
393
394         /*
395          * The map->core.base data pointed to by buf is copied into a free map->aio.data[] buffer
396          * to release space in the kernel buffer as fast as possible, calling
397          * perf_mmap__consume() from the perf_mmap__push() function.
398          *
399          * That lets the kernel proceed with storing more profiling data into
400          * the kernel buffer earlier than the other per-cpu kernel buffers are handled.
401          *
402          * Copying can be done in two steps in case the chunk of profiling data
403          * crosses the upper bound of the kernel buffer. In this case we first move
404          * the part of the data from map->start to the upper bound and then the remainder
405          * from the beginning of the kernel buffer to the end of the data chunk.
406          */
407
408         if (record__comp_enabled(aio->rec)) {
409                 size = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
410                                      mmap__mmap_len(map) - aio->size,
411                                      buf, size);
412         } else {
413                 memcpy(aio->data + aio->size, buf, size);
414         }
415
416         if (!aio->size) {
417                 /*
418                  * Increment map->refcount to guard the map->aio.data[] buffer
419                  * from premature deallocation, because the map object can be
420                  * released before the aio write request started on the
421                  * map->aio.data[] buffer is complete.
422                  *
423                  * perf_mmap__put() is done in record__aio_complete()
424                  * after the started aio request completes, or in record__aio_push()
425                  * if the request failed to start.
426                  */
427                 perf_mmap__get(&map->core);
428         }
429
430         aio->size += size;
431
432         return size;
433 }
434
435 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
436 {
437         int ret, idx;
438         int trace_fd = rec->session->data->file.fd;
439         struct record_aio aio = { .rec = rec, .size = 0 };
440
441         /*
442          * Call record__aio_sync() to wait until a map->aio.data[] buffer
443          * becomes available after the previous aio write operation.
444          */
445
446         idx = record__aio_sync(map, false);
447         aio.data = map->aio.data[idx];
448         ret = perf_mmap__push(map, &aio, record__aio_pushfn);
449         if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
450                 return ret;
451
452         rec->samples++;
453         ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
454         if (!ret) {
455                 *off += aio.size;
456                 rec->bytes_written += aio.size;
457                 if (switch_output_size(rec))
458                         trigger_hit(&switch_output_trigger);
459         } else {
460                 /*
461                  * Decrement the map->refcount incremented in record__aio_pushfn()
462                  * if the record__aio_write() operation failed to start; otherwise
463                  * map->refcount is decremented in record__aio_complete() after
464                  * the aio write operation finishes successfully.
465                  */
466                 perf_mmap__put(&map->core);
467         }
468
469         return ret;
470 }
471
472 static off_t record__aio_get_pos(int trace_fd)
473 {
474         return lseek(trace_fd, 0, SEEK_CUR);
475 }
476
477 static void record__aio_set_pos(int trace_fd, off_t pos)
478 {
479         lseek(trace_fd, pos, SEEK_SET);
480 }
481
482 static void record__aio_mmap_read_sync(struct record *rec)
483 {
484         int i;
485         struct evlist *evlist = rec->evlist;
486         struct mmap *maps = evlist->mmap;
487
488         if (!record__aio_enabled(rec))
489                 return;
490
491         for (i = 0; i < evlist->core.nr_mmaps; i++) {
492                 struct mmap *map = &maps[i];
493
494                 if (map->core.base)
495                         record__aio_sync(map, true);
496         }
497 }
498
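/* Default and maximum number of asynchronous write control blocks per mmap. */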
499 static int nr_cblocks_default = 1;
500 static int nr_cblocks_max = 4;
501
502 static int record__aio_parse(const struct option *opt,
503                              const char *str,
504                              int unset)
505 {
506         struct record_opts *opts = (struct record_opts *)opt->value;
507
508         if (unset) {
509                 opts->nr_cblocks = 0;
510         } else {
511                 if (str)
512                         opts->nr_cblocks = strtol(str, NULL, 0);
513                 if (!opts->nr_cblocks)
514                         opts->nr_cblocks = nr_cblocks_default;
515         }
516
517         return 0;
518 }
519 #else /* HAVE_AIO_SUPPORT */
520 static int nr_cblocks_max = 0;
521
522 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
523                             off_t *off __maybe_unused)
524 {
525         return -1;
526 }
527
528 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
529 {
530         return -1;
531 }
532
533 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
534 {
535 }
536
537 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
538 {
539 }
540 #endif
541
542 static int record__aio_enabled(struct record *rec)
543 {
544         return rec->opts.nr_cblocks > 0;
545 }
546
547 #define MMAP_FLUSH_DEFAULT 1
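/*
 * Parse the mmap flush threshold: a plain byte count or a value with a
 * B/K/M/G suffix, capped at a quarter of the mmap buffer size.
 */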
548 static int record__mmap_flush_parse(const struct option *opt,
549                                     const char *str,
550                                     int unset)
551 {
552         int flush_max;
553         struct record_opts *opts = (struct record_opts *)opt->value;
554         static struct parse_tag tags[] = {
555                         { .tag  = 'B', .mult = 1       },
556                         { .tag  = 'K', .mult = 1 << 10 },
557                         { .tag  = 'M', .mult = 1 << 20 },
558                         { .tag  = 'G', .mult = 1 << 30 },
559                         { .tag  = 0 },
560         };
561
562         if (unset)
563                 return 0;
564
565         if (str) {
566                 opts->mmap_flush = parse_tag_value(str, tags);
567                 if (opts->mmap_flush == (int)-1)
568                         opts->mmap_flush = strtol(str, NULL, 0);
569         }
570
571         if (!opts->mmap_flush)
572                 opts->mmap_flush = MMAP_FLUSH_DEFAULT;
573
574         flush_max = evlist__mmap_size(opts->mmap_pages);
575         flush_max /= 4;
576         if (opts->mmap_flush > flush_max)
577                 opts->mmap_flush = flush_max;
578
579         return 0;
580 }
581
582 #ifdef HAVE_ZSTD_SUPPORT
583 static unsigned int comp_level_default = 1;
584
585 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
586 {
587         struct record_opts *opts = opt->value;
588
589         if (unset) {
590                 opts->comp_level = 0;
591         } else {
592                 if (str)
593                         opts->comp_level = strtol(str, NULL, 0);
594                 if (!opts->comp_level)
595                         opts->comp_level = comp_level_default;
596         }
597
598         return 0;
599 }
600 #endif
601 static unsigned int comp_level_max = 22;
602
603 static int record__comp_enabled(struct record *rec)
604 {
605         return rec->opts.comp_level > 0;
606 }
607
608 static int process_synthesized_event(struct perf_tool *tool,
609                                      union perf_event *event,
610                                      struct perf_sample *sample __maybe_unused,
611                                      struct machine *machine __maybe_unused)
612 {
613         struct record *rec = container_of(tool, struct record, tool);
614         return record__write(rec, NULL, event, event->header.size);
615 }
616
617 static struct mutex synth_lock;
618
619 static int process_locked_synthesized_event(struct perf_tool *tool,
620                                      union perf_event *event,
621                                      struct perf_sample *sample __maybe_unused,
622                                      struct machine *machine __maybe_unused)
623 {
624         int ret;
625
626         mutex_lock(&synth_lock);
627         ret = process_synthesized_event(tool, event, sample, machine);
628         mutex_unlock(&synth_lock);
629         return ret;
630 }
631
632 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
633 {
634         struct record *rec = to;
635
636         if (record__comp_enabled(rec)) {
637                 size = zstd_compress(rec->session, map, map->data, mmap__mmap_len(map), bf, size);
638                 bf   = map->data;
639         }
640
641         thread->samples++;
642         return record__write(rec, map, bf, size);
643 }
644
645 static volatile sig_atomic_t signr = -1;
646 static volatile sig_atomic_t child_finished;
647 #ifdef HAVE_EVENTFD_SUPPORT
648 static volatile sig_atomic_t done_fd = -1;
649 #endif
650
651 static void sig_handler(int sig)
652 {
653         if (sig == SIGCHLD)
654                 child_finished = 1;
655         else
656                 signr = sig;
657
658         done = 1;
659 #ifdef HAVE_EVENTFD_SUPPORT
660         if (done_fd >= 0) {
661                 u64 tmp = 1;
662                 int orig_errno = errno;
663
664                 /*
665                  * It is possible for this signal handler to run after done is
666                  * checked in the main loop, but before the perf counter fds are
667                  * polled. If this happens, the poll() will continue to wait
668                  * even though done is set, and will only break out if either
669                  * another signal is received, or the counters are ready for
670                  * read. To ensure the poll() doesn't sleep when done is set,
671                  * use an eventfd (done_fd) to wake up the poll().
672                  */
673                 if (write(done_fd, &tmp, sizeof(tmp)) < 0)
674                         pr_err("failed to signal wakeup fd, error: %m\n");
675
676                 errno = orig_errno;
677         }
678 #endif // HAVE_EVENTFD_SUPPORT
679 }
680
681 static void sigsegv_handler(int sig)
682 {
683         perf_hooks__recover();
684         sighandler_dump_stack(sig);
685 }
686
687 static void record__sig_exit(void)
688 {
689         if (signr == -1)
690                 return;
691
692         signal(signr, SIG_DFL);
693         raise(signr);
694 }
695
696 #ifdef HAVE_AUXTRACE_SUPPORT
697
698 static int record__process_auxtrace(struct perf_tool *tool,
699                                     struct mmap *map,
700                                     union perf_event *event, void *data1,
701                                     size_t len1, void *data2, size_t len2)
702 {
703         struct record *rec = container_of(tool, struct record, tool);
704         struct perf_data *data = &rec->data;
705         size_t padding;
706         u8 pad[8] = {0};
707
708         if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
709                 off_t file_offset;
710                 int fd = perf_data__fd(data);
711                 int err;
712
713                 file_offset = lseek(fd, 0, SEEK_CUR);
714                 if (file_offset == -1)
715                         return -1;
716                 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
717                                                      event, file_offset);
718                 if (err)
719                         return err;
720         }
721
722         /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
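        /* e.g. len1 + len2 == 13 gives padding == 3, rounding the payload up to a multiple of 8 bytes */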
723         padding = (len1 + len2) & 7;
724         if (padding)
725                 padding = 8 - padding;
726
727         record__write(rec, map, event, event->header.size);
728         record__write(rec, map, data1, len1);
729         if (len2)
730                 record__write(rec, map, data2, len2);
731         record__write(rec, map, &pad, padding);
732
733         return 0;
734 }
735
736 static int record__auxtrace_mmap_read(struct record *rec,
737                                       struct mmap *map)
738 {
739         int ret;
740
741         ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
742                                   record__process_auxtrace);
743         if (ret < 0)
744                 return ret;
745
746         if (ret)
747                 rec->samples++;
748
749         return 0;
750 }
751
752 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
753                                                struct mmap *map)
754 {
755         int ret;
756
757         ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
758                                            record__process_auxtrace,
759                                            rec->opts.auxtrace_snapshot_size);
760         if (ret < 0)
761                 return ret;
762
763         if (ret)
764                 rec->samples++;
765
766         return 0;
767 }
768
769 static int record__auxtrace_read_snapshot_all(struct record *rec)
770 {
771         int i;
772         int rc = 0;
773
774         for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
775                 struct mmap *map = &rec->evlist->mmap[i];
776
777                 if (!map->auxtrace_mmap.base)
778                         continue;
779
780                 if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
781                         rc = -1;
782                         goto out;
783                 }
784         }
785 out:
786         return rc;
787 }
788
789 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
790 {
791         pr_debug("Recording AUX area tracing snapshot\n");
792         if (record__auxtrace_read_snapshot_all(rec) < 0) {
793                 trigger_error(&auxtrace_snapshot_trigger);
794         } else {
795                 if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
796                         trigger_error(&auxtrace_snapshot_trigger);
797                 else
798                         trigger_ready(&auxtrace_snapshot_trigger);
799         }
800 }
801
802 static int record__auxtrace_snapshot_exit(struct record *rec)
803 {
804         if (trigger_is_error(&auxtrace_snapshot_trigger))
805                 return 0;
806
807         if (!auxtrace_record__snapshot_started &&
808             auxtrace_record__snapshot_start(rec->itr))
809                 return -1;
810
811         record__read_auxtrace_snapshot(rec, true);
812         if (trigger_is_error(&auxtrace_snapshot_trigger))
813                 return -1;
814
815         return 0;
816 }
817
818 static int record__auxtrace_init(struct record *rec)
819 {
820         int err;
821
822         if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
823             && record__threads_enabled(rec)) {
824                 pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
825                 return -EINVAL;
826         }
827
828         if (!rec->itr) {
829                 rec->itr = auxtrace_record__init(rec->evlist, &err);
830                 if (err)
831                         return err;
832         }
833
834         err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
835                                               rec->opts.auxtrace_snapshot_opts);
836         if (err)
837                 return err;
838
839         err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
840                                             rec->opts.auxtrace_sample_opts);
841         if (err)
842                 return err;
843
844         auxtrace_regroup_aux_output(rec->evlist);
845
846         return auxtrace_parse_filters(rec->evlist);
847 }
848
849 #else
850
851 static inline
852 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
853                                struct mmap *map __maybe_unused)
854 {
855         return 0;
856 }
857
858 static inline
859 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
860                                     bool on_exit __maybe_unused)
861 {
862 }
863
864 static inline
865 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
866 {
867         return 0;
868 }
869
870 static inline
871 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
872 {
873         return 0;
874 }
875
876 static int record__auxtrace_init(struct record *rec __maybe_unused)
877 {
878         return 0;
879 }
880
881 #endif
882
883 static int record__config_text_poke(struct evlist *evlist)
884 {
885         struct evsel *evsel;
886
887         /* Nothing to do if text poke is already configured */
888         evlist__for_each_entry(evlist, evsel) {
889                 if (evsel->core.attr.text_poke)
890                         return 0;
891         }
892
893         evsel = evlist__add_dummy_on_all_cpus(evlist);
894         if (!evsel)
895                 return -ENOMEM;
896
897         evsel->core.attr.text_poke = 1;
898         evsel->core.attr.ksymbol = 1;
899         evsel->immediate = true;
900         evsel__set_sample_bit(evsel, TIME);
901
902         return 0;
903 }
904
905 static int record__config_off_cpu(struct record *rec)
906 {
907         return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
908 }
909
910 static bool record__kcore_readable(struct machine *machine)
911 {
912         char kcore[PATH_MAX];
913         int fd;
914
915         scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
916
917         fd = open(kcore, O_RDONLY);
918         if (fd < 0)
919                 return false;
920
921         close(fd);
922
923         return true;
924 }
925
926 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
927 {
928         char from_dir[PATH_MAX];
929         char kcore_dir[PATH_MAX];
930         int ret;
931
932         snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
933
934         ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
935         if (ret)
936                 return ret;
937
938         return kcore_copy(from_dir, kcore_dir);
939 }
940
941 static void record__thread_data_init_pipes(struct record_thread *thread_data)
942 {
943         thread_data->pipes.msg[0] = -1;
944         thread_data->pipes.msg[1] = -1;
945         thread_data->pipes.ack[0] = -1;
946         thread_data->pipes.ack[1] = -1;
947 }
948
949 static int record__thread_data_open_pipes(struct record_thread *thread_data)
950 {
951         if (pipe(thread_data->pipes.msg))
952                 return -EINVAL;
953
954         if (pipe(thread_data->pipes.ack)) {
955                 close(thread_data->pipes.msg[0]);
956                 thread_data->pipes.msg[0] = -1;
957                 close(thread_data->pipes.msg[1]);
958                 thread_data->pipes.msg[1] = -1;
959                 return -EINVAL;
960         }
961
962         pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
963                  thread_data->pipes.msg[0], thread_data->pipes.msg[1],
964                  thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
965
966         return 0;
967 }
968
969 static void record__thread_data_close_pipes(struct record_thread *thread_data)
970 {
971         if (thread_data->pipes.msg[0] != -1) {
972                 close(thread_data->pipes.msg[0]);
973                 thread_data->pipes.msg[0] = -1;
974         }
975         if (thread_data->pipes.msg[1] != -1) {
976                 close(thread_data->pipes.msg[1]);
977                 thread_data->pipes.msg[1] = -1;
978         }
979         if (thread_data->pipes.ack[0] != -1) {
980                 close(thread_data->pipes.ack[0]);
981                 thread_data->pipes.ack[0] = -1;
982         }
983         if (thread_data->pipes.ack[1] != -1) {
984                 close(thread_data->pipes.ack[1]);
985                 thread_data->pipes.ack[1] = -1;
986         }
987 }
988
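/* True when mmaps are set up per thread rather than per CPU (dummy user CPU map). */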
989 static bool evlist__per_thread(struct evlist *evlist)
990 {
991         return cpu_map__is_dummy(evlist->core.user_requested_cpus);
992 }
993
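/*
 * Hand out the evlist mmaps to this thread: all of them in per-thread
 * mode, otherwise only those whose CPU is set in the thread's maps mask.
 */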
994 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
995 {
996         int m, tm, nr_mmaps = evlist->core.nr_mmaps;
997         struct mmap *mmap = evlist->mmap;
998         struct mmap *overwrite_mmap = evlist->overwrite_mmap;
999         struct perf_cpu_map *cpus = evlist->core.all_cpus;
1000         bool per_thread = evlist__per_thread(evlist);
1001
1002         if (per_thread)
1003                 thread_data->nr_mmaps = nr_mmaps;
1004         else
1005                 thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1006                                                       thread_data->mask->maps.nbits);
1007         if (mmap) {
1008                 thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1009                 if (!thread_data->maps)
1010                         return -ENOMEM;
1011         }
1012         if (overwrite_mmap) {
1013                 thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1014                 if (!thread_data->overwrite_maps) {
1015                         zfree(&thread_data->maps);
1016                         return -ENOMEM;
1017                 }
1018         }
1019         pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1020                  thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1021
1022         for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1023                 if (per_thread ||
1024                     test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1025                         if (thread_data->maps) {
1026                                 thread_data->maps[tm] = &mmap[m];
1027                                 pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1028                                           thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1029                         }
1030                         if (thread_data->overwrite_maps) {
1031                                 thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1032                                 pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1033                                           thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1034                         }
1035                         tm++;
1036                 }
1037         }
1038
1039         return 0;
1040 }
1041
1042 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1043 {
1044         int f, tm, pos;
1045         struct mmap *map, *overwrite_map;
1046
1047         fdarray__init(&thread_data->pollfd, 64);
1048
1049         for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1050                 map = thread_data->maps ? thread_data->maps[tm] : NULL;
1051                 overwrite_map = thread_data->overwrite_maps ?
1052                                 thread_data->overwrite_maps[tm] : NULL;
1053
1054                 for (f = 0; f < evlist->core.pollfd.nr; f++) {
1055                         void *ptr = evlist->core.pollfd.priv[f].ptr;
1056
1057                         if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1058                                 pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1059                                                               &evlist->core.pollfd);
1060                                 if (pos < 0)
1061                                         return pos;
1062                                 pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1063                                          thread_data, pos, evlist->core.pollfd.entries[f].fd);
1064                         }
1065                 }
1066         }
1067
1068         return 0;
1069 }
1070
1071 static void record__free_thread_data(struct record *rec)
1072 {
1073         int t;
1074         struct record_thread *thread_data = rec->thread_data;
1075
1076         if (thread_data == NULL)
1077                 return;
1078
1079         for (t = 0; t < rec->nr_threads; t++) {
1080                 record__thread_data_close_pipes(&thread_data[t]);
1081                 zfree(&thread_data[t].maps);
1082                 zfree(&thread_data[t].overwrite_maps);
1083                 fdarray__exit(&thread_data[t].pollfd);
1084         }
1085
1086         zfree(&rec->thread_data);
1087 }
1088
1089 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1090                                                     int evlist_pollfd_index,
1091                                                     int thread_pollfd_index)
1092 {
1093         size_t x = rec->index_map_cnt;
1094
1095         if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1096                 return -ENOMEM;
1097         rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1098         rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1099         rec->index_map_cnt += 1;
1100         return 0;
1101 }
1102
1103 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1104                                                     struct evlist *evlist,
1105                                                     struct record_thread *thread_data)
1106 {
1107         struct pollfd *e_entries = evlist->core.pollfd.entries;
1108         struct pollfd *t_entries = thread_data->pollfd.entries;
1109         int err = 0;
1110         size_t i;
1111
1112         for (i = 0; i < rec->index_map_cnt; i++) {
1113                 int e_pos = rec->index_map[i].evlist_pollfd_index;
1114                 int t_pos = rec->index_map[i].thread_pollfd_index;
1115
1116                 if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1117                     e_entries[e_pos].events != t_entries[t_pos].events) {
1118                         pr_err("Thread and evlist pollfd index mismatch\n");
1119                         err = -EINVAL;
1120                         continue;
1121                 }
1122                 e_entries[e_pos].revents = t_entries[t_pos].revents;
1123         }
1124         return err;
1125 }
1126
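/*
 * Duplicate descriptors flagged as non-perf events from the evlist pollfd
 * into the main thread's pollfd and remember the index mapping so their
 * revents can be copied back after polling.
 */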
1127 static int record__dup_non_perf_events(struct record *rec,
1128                                        struct evlist *evlist,
1129                                        struct record_thread *thread_data)
1130 {
1131         struct fdarray *fda = &evlist->core.pollfd;
1132         int i, ret;
1133
1134         for (i = 0; i < fda->nr; i++) {
1135                 if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1136                         continue;
1137                 ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1138                 if (ret < 0) {
1139                         pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1140                         return ret;
1141                 }
1142                 pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1143                           thread_data, ret, fda->entries[i].fd);
1144                 ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1145                 if (ret < 0) {
1146                         pr_err("Failed to map thread and evlist pollfd indexes\n");
1147                         return ret;
1148                 }
1149         }
1150         return 0;
1151 }
1152
1153 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1154 {
1155         int t, ret;
1156         struct record_thread *thread_data;
1157
1158         rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1159         if (!rec->thread_data) {
1160                 pr_err("Failed to allocate thread data\n");
1161                 return -ENOMEM;
1162         }
1163         thread_data = rec->thread_data;
1164
1165         for (t = 0; t < rec->nr_threads; t++)
1166                 record__thread_data_init_pipes(&thread_data[t]);
1167
1168         for (t = 0; t < rec->nr_threads; t++) {
1169                 thread_data[t].rec = rec;
1170                 thread_data[t].mask = &rec->thread_masks[t];
1171                 ret = record__thread_data_init_maps(&thread_data[t], evlist);
1172                 if (ret) {
1173                         pr_err("Failed to initialize thread[%d] maps\n", t);
1174                         goto out_free;
1175                 }
1176                 ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1177                 if (ret) {
1178                         pr_err("Failed to initialize thread[%d] pollfd\n", t);
1179                         goto out_free;
1180                 }
1181                 if (t) {
1182                         thread_data[t].tid = -1;
1183                         ret = record__thread_data_open_pipes(&thread_data[t]);
1184                         if (ret) {
1185                                 pr_err("Failed to open thread[%d] communication pipes\n", t);
1186                                 goto out_free;
1187                         }
1188                         ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1189                                            POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1190                         if (ret < 0) {
1191                                 pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1192                                 goto out_free;
1193                         }
1194                         thread_data[t].ctlfd_pos = ret;
1195                         pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1196                                  thread_data, thread_data[t].ctlfd_pos,
1197                                  thread_data[t].pipes.msg[0]);
1198                 } else {
1199                         thread_data[t].tid = gettid();
1200
1201                         ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1202                         if (ret < 0)
1203                                 goto out_free;
1204
1205                         thread_data[t].ctlfd_pos = -1; /* Not used */
1206                 }
1207         }
1208
1209         return 0;
1210
1211 out_free:
1212         record__free_thread_data(rec);
1213
1214         return ret;
1215 }
1216
1217 static int record__mmap_evlist(struct record *rec,
1218                                struct evlist *evlist)
1219 {
1220         int i, ret;
1221         struct record_opts *opts = &rec->opts;
1222         bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1223                                   opts->auxtrace_sample_mode;
1224         char msg[512];
1225
1226         if (opts->affinity != PERF_AFFINITY_SYS)
1227                 cpu__setup_cpunode_map();
1228
1229         if (evlist__mmap_ex(evlist, opts->mmap_pages,
1230                                  opts->auxtrace_mmap_pages,
1231                                  auxtrace_overwrite,
1232                                  opts->nr_cblocks, opts->affinity,
1233                                  opts->mmap_flush, opts->comp_level) < 0) {
1234                 if (errno == EPERM) {
1235                         pr_err("Permission error mapping pages.\n"
1236                                "Consider increasing "
1237                                "/proc/sys/kernel/perf_event_mlock_kb,\n"
1238                                "or try again with a smaller value of -m/--mmap_pages.\n"
1239                                "(current value: %u,%u)\n",
1240                                opts->mmap_pages, opts->auxtrace_mmap_pages);
1241                         return -errno;
1242                 } else {
1243                         pr_err("failed to mmap with %d (%s)\n", errno,
1244                                 str_error_r(errno, msg, sizeof(msg)));
1245                         if (errno)
1246                                 return -errno;
1247                         else
1248                                 return -EINVAL;
1249                 }
1250         }
1251
1252         if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1253                 return -1;
1254
1255         ret = record__alloc_thread_data(rec, evlist);
1256         if (ret)
1257                 return ret;
1258
1259         if (record__threads_enabled(rec)) {
1260                 ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1261                 if (ret) {
1262                         pr_err("Failed to create data directory: %s\n", strerror(-ret));
1263                         return ret;
1264                 }
1265                 for (i = 0; i < evlist->core.nr_mmaps; i++) {
1266                         if (evlist->mmap)
1267                                 evlist->mmap[i].file = &rec->data.dir.files[i];
1268                         if (evlist->overwrite_mmap)
1269                                 evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1270                 }
1271         }
1272
1273         return 0;
1274 }
1275
1276 static int record__mmap(struct record *rec)
1277 {
1278         return record__mmap_evlist(rec, rec->evlist);
1279 }
1280
1281 static int record__open(struct record *rec)
1282 {
1283         char msg[BUFSIZ];
1284         struct evsel *pos;
1285         struct evlist *evlist = rec->evlist;
1286         struct perf_session *session = rec->session;
1287         struct record_opts *opts = &rec->opts;
1288         int rc = 0;
1289
1290         /*
1291          * For initial_delay, system wide, or a hybrid system, we need to add a
1292          * dummy event so that we can track PERF_RECORD_MMAP to cover the delay
1293          * of waiting or of event synthesis.
1294          */
1295         if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
1296             perf_pmu__has_hybrid()) {
1297                 pos = evlist__get_tracking_event(evlist);
1298                 if (!evsel__is_dummy_event(pos)) {
1299                         /* Set up dummy event. */
1300                         if (evlist__add_dummy(evlist))
1301                                 return -ENOMEM;
1302                         pos = evlist__last(evlist);
1303                         evlist__set_tracking_event(evlist, pos);
1304                 }
1305
1306                 /*
1307                  * Enable the dummy event when the process is forked for
1308                  * initial_delay, or immediately for system wide.
1309                  */
1310                 if (opts->target.initial_delay && !pos->immediate &&
1311                     !target__has_cpu(&opts->target))
1312                         pos->core.attr.enable_on_exec = 1;
1313                 else
1314                         pos->immediate = 1;
1315         }
1316
1317         evlist__config(evlist, opts, &callchain_param);
1318
1319         evlist__for_each_entry(evlist, pos) {
1320 try_again:
1321                 if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
1322                         if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
1323                                 if (verbose > 0)
1324                                         ui__warning("%s\n", msg);
1325                                 goto try_again;
1326                         }
1327                         if ((errno == EINVAL || errno == EBADF) &&
1328                             pos->core.leader != &pos->core &&
1329                             pos->weak_group) {
1330                                 pos = evlist__reset_weak_group(evlist, pos, true);
1331                                 goto try_again;
1332                         }
1333                         rc = -errno;
1334                         evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
1335                         ui__error("%s\n", msg);
1336                         goto out;
1337                 }
1338
1339                 pos->supported = true;
1340         }
1341
1342         if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1343                 pr_warning(
1344 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1345 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1346 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1347 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1348 "Samples in kernel modules won't be resolved at all.\n\n"
1349 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1350 "even with a suitable vmlinux or kallsyms file.\n\n");
1351         }
1352
1353         if (evlist__apply_filters(evlist, &pos)) {
1354                 pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
1355                         pos->filter, evsel__name(pos), errno,
1356                         str_error_r(errno, msg, sizeof(msg)));
1357                 rc = -1;
1358                 goto out;
1359         }
1360
1361         rc = record__mmap(rec);
1362         if (rc)
1363                 goto out;
1364
1365         session->evlist = evlist;
1366         perf_session__set_id_hdr_size(session);
1367 out:
1368         return rc;
1369 }
1370
1371 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1372 {
1373         if (rec->evlist->first_sample_time == 0)
1374                 rec->evlist->first_sample_time = sample_time;
1375
1376         if (sample_time)
1377                 rec->evlist->last_sample_time = sample_time;
1378 }
1379
1380 static int process_sample_event(struct perf_tool *tool,
1381                                 union perf_event *event,
1382                                 struct perf_sample *sample,
1383                                 struct evsel *evsel,
1384                                 struct machine *machine)
1385 {
1386         struct record *rec = container_of(tool, struct record, tool);
1387
1388         set_timestamp_boundary(rec, sample->time);
1389
1390         if (rec->buildid_all)
1391                 return 0;
1392
1393         rec->samples++;
1394         return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1395 }
1396
1397 static int process_buildids(struct record *rec)
1398 {
1399         struct perf_session *session = rec->session;
1400
1401         if (perf_data__size(&rec->data) == 0)
1402                 return 0;
1403
1404         /*
1405          * During this process, it'll load the kernel map and replace
1406          * dso->long_name with the real pathname it found.  In this case
1407          * we prefer the vmlinux path like
1408          *   /lib/modules/3.16.4/build/vmlinux
1409          *
1410          * rather than the build-id path (in the debug directory).
1411          *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1412          */
1413         symbol_conf.ignore_vmlinux_buildid = true;
1414
1415         /*
1416          * If --buildid-all is given, it marks all DSOs regardless of hits,
1417          * so there is no need to process samples. But if timestamp_boundary is
1418          * enabled, it still needs to walk all samples to get the timestamps of
1419          * the first/last samples.
1420          */
1421         if (rec->buildid_all && !rec->timestamp_boundary)
1422                 rec->tool.sample = NULL;
1423
1424         return perf_session__process_events(session);
1425 }
1426
1427 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1428 {
1429         int err;
1430         struct perf_tool *tool = data;
1431         /*
1432          * As for the guest kernel, when processing the record & report
1433          * subcommands we arrange the module mmaps prior to the guest kernel
1434          * mmap and trigger a DSO preload, because by default guest module
1435          * symbols are loaded from guest kallsyms instead of
1436          * /lib/modules/XXX/XXX. This avoids missing symbols when the first
1437          * address is in a module instead of in the guest kernel.
1438          */
1439         err = perf_event__synthesize_modules(tool, process_synthesized_event,
1440                                              machine);
1441         if (err < 0)
1442                 pr_err("Couldn't record guest kernel [%d]'s reference"
1443                        " relocation symbol.\n", machine->pid);
1444
1445         /*
1446          * We use _stext for the guest kernel because the guest kernel's
1447          * /proc/kallsyms sometimes has no _text.
1448          */
1449         err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1450                                                  machine);
1451         if (err < 0)
1452                 pr_err("Couldn't record guest kernel [%d]'s reference"
1453                        " relocation symbol.\n", machine->pid);
1454 }
1455
1456 static struct perf_event_header finished_round_event = {
1457         .size = sizeof(struct perf_event_header),
1458         .type = PERF_RECORD_FINISHED_ROUND,
1459 };
1460
1461 static struct perf_event_header finished_init_event = {
1462         .size = sizeof(struct perf_event_header),
1463         .type = PERF_RECORD_FINISHED_INIT,
1464 };
1465
1466 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1467 {
1468         if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1469             !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1470                           thread->mask->affinity.nbits)) {
1471                 bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1472                 bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1473                           map->affinity_mask.bits, thread->mask->affinity.nbits);
1474                 sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1475                                         (cpu_set_t *)thread->mask->affinity.bits);
1476                 if (verbose == 2) {
1477                         pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1478                         mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1479                 }
1480         }
1481 }
1482
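/*
 * Callback for zstd_compress_stream_to_records(): with increment == 0 it
 * starts a new PERF_RECORD_COMPRESSED record and reserves room for its
 * header, otherwise it grows the record's header size by the newly
 * compressed chunk.
 */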
1483 static size_t process_comp_header(void *record, size_t increment)
1484 {
1485         struct perf_record_compressed *event = record;
1486         size_t size = sizeof(*event);
1487
1488         if (increment) {
1489                 event->header.size += increment;
1490                 return increment;
1491         }
1492
1493         event->header.type = PERF_RECORD_COMPRESSED;
1494         event->header.size = size;
1495
1496         return size;
1497 }
1498
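/*
 * Compress src into dst as one or more PERF_RECORD_COMPRESSED records and
 * account the transferred/compressed byte counts per thread (directory
 * output) or per session.
 */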
1499 static size_t zstd_compress(struct perf_session *session, struct mmap *map,
1500                             void *dst, size_t dst_size, void *src, size_t src_size)
1501 {
1502         size_t compressed;
1503         size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1504         struct zstd_data *zstd_data = &session->zstd_data;
1505
1506         if (map && map->file)
1507                 zstd_data = &map->zstd_data;
1508
1509         compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1510                                                      max_record_size, process_comp_header);
1511
1512         if (map && map->file) {
1513                 thread->bytes_transferred += src_size;
1514                 thread->bytes_compressed  += compressed;
1515         } else {
1516                 session->bytes_transferred += src_size;
1517                 session->bytes_compressed  += compressed;
1518         }
1519
1520         return compressed;
1521 }
1522
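/*
 * Drain the calling thread's maps (regular or overwrite) for the given
 * evlist: push data either synchronously via record__pushfn() or through
 * AIO, read the AUX area mmaps unless in snapshot/sample mode, and, in
 * non-threaded mode, append a PERF_RECORD_FINISHED_ROUND marker when
 * anything was written. With 'synch' set, the flush threshold is
 * temporarily dropped to 1 so all pending data is pushed out.
 */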
1523 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1524                                     bool overwrite, bool synch)
1525 {
1526         u64 bytes_written = rec->bytes_written;
1527         int i;
1528         int rc = 0;
1529         int nr_mmaps;
1530         struct mmap **maps;
1531         int trace_fd = rec->data.file.fd;
1532         off_t off = 0;
1533
1534         if (!evlist)
1535                 return 0;
1536
1537         nr_mmaps = thread->nr_mmaps;
1538         maps = overwrite ? thread->overwrite_maps : thread->maps;
1539
1540         if (!maps)
1541                 return 0;
1542
1543         if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1544                 return 0;
1545
1546         if (record__aio_enabled(rec))
1547                 off = record__aio_get_pos(trace_fd);
1548
1549         for (i = 0; i < nr_mmaps; i++) {
1550                 u64 flush = 0;
1551                 struct mmap *map = maps[i];
1552
1553                 if (map->core.base) {
1554                         record__adjust_affinity(rec, map);
1555                         if (synch) {
1556                                 flush = map->core.flush;
1557                                 map->core.flush = 1;
1558                         }
1559                         if (!record__aio_enabled(rec)) {
1560                                 if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1561                                         if (synch)
1562                                                 map->core.flush = flush;
1563                                         rc = -1;
1564                                         goto out;
1565                                 }
1566                         } else {
1567                                 if (record__aio_push(rec, map, &off) < 0) {
1568                                         record__aio_set_pos(trace_fd, off);
1569                                         if (synch)
1570                                                 map->core.flush = flush;
1571                                         rc = -1;
1572                                         goto out;
1573                                 }
1574                         }
1575                         if (synch)
1576                                 map->core.flush = flush;
1577                 }
1578
1579                 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1580                     !rec->opts.auxtrace_sample_mode &&
1581                     record__auxtrace_mmap_read(rec, map) != 0) {
1582                         rc = -1;
1583                         goto out;
1584                 }
1585         }
1586
1587         if (record__aio_enabled(rec))
1588                 record__aio_set_pos(trace_fd, off);
1589
1590         /*
1591          * Mark the round finished in case we wrote
1592          * at least one event.
1593          *
1594          * No need for round events in directory mode,
1595          * because the per-CPU maps and files already
1596          * contain data sorted by the kernel.
1597          */
1598         if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1599                 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1600
1601         if (overwrite)
1602                 evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1603 out:
1604         return rc;
1605 }
1606
1607 static int record__mmap_read_all(struct record *rec, bool synch)
1608 {
1609         int err;
1610
1611         err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1612         if (err)
1613                 return err;
1614
1615         return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1616 }
1617
1618 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1619                                            void *arg __maybe_unused)
1620 {
1621         struct perf_mmap *map = fda->priv[fd].ptr;
1622
1623         if (map)
1624                 perf_mmap__put(map);
1625 }
1626
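/*
 * Body of a worker thread in threaded mode: ack startup over the ack pipe,
 * then alternate between draining this thread's maps and polling its fds
 * until the message pipe is closed (POLLHUP) by record__terminate_thread(),
 * finishing with one synchronous drain and a termination ack.
 */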
1627 static void *record__thread(void *arg)
1628 {
1629         enum thread_msg msg = THREAD_MSG__READY;
1630         bool terminate = false;
1631         struct fdarray *pollfd;
1632         int err, ctlfd_pos;
1633
1634         thread = arg;
1635         thread->tid = gettid();
1636
1637         err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1638         if (err == -1)
1639                 pr_warning("threads[%d]: failed to notify on start: %s\n",
1640                            thread->tid, strerror(errno));
1641
1642         pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1643
1644         pollfd = &thread->pollfd;
1645         ctlfd_pos = thread->ctlfd_pos;
1646
1647         for (;;) {
1648                 unsigned long long hits = thread->samples;
1649
1650                 if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
1651                         break;
1652
1653                 if (hits == thread->samples) {
1654
1655                         err = fdarray__poll(pollfd, -1);
1656                         /*
1657                          * Propagate the error only if there is one: a positive
1658                          * number of returned events and EINTR are ignored.
1659                          */
1660                         if (err > 0 || (err < 0 && errno == EINTR))
1661                                 err = 0;
1662                         thread->waking++;
1663
1664                         if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1665                                             record__thread_munmap_filtered, NULL) == 0)
1666                                 break;
1667                 }
1668
1669                 if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1670                         terminate = true;
1671                         close(thread->pipes.msg[0]);
1672                         thread->pipes.msg[0] = -1;
1673                         pollfd->entries[ctlfd_pos].fd = -1;
1674                         pollfd->entries[ctlfd_pos].events = 0;
1675                 }
1676
1677                 pollfd->entries[ctlfd_pos].revents = 0;
1678         }
1679         record__mmap_read_all(thread->rec, true);
1680
1681         err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1682         if (err == -1)
1683                 pr_warning("threads[%d]: failed to notify on termination: %s\n",
1684                            thread->tid, strerror(errno));
1685
1686         return NULL;
1687 }
1688
1689 static void record__init_features(struct record *rec)
1690 {
1691         struct perf_session *session = rec->session;
1692         int feat;
1693
1694         for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1695                 perf_header__set_feat(&session->header, feat);
1696
1697         if (rec->no_buildid)
1698                 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1699
1700 #ifdef HAVE_LIBTRACEEVENT
1701         if (!have_tracepoints(&rec->evlist->core.entries))
1702                 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1703 #endif
1704
1705         if (!rec->opts.branch_stack)
1706                 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1707
1708         if (!rec->opts.full_auxtrace)
1709                 perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1710
1711         if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1712                 perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1713
1714         if (!rec->opts.use_clockid)
1715                 perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1716
1717         if (!record__threads_enabled(rec))
1718                 perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1719
1720         if (!record__comp_enabled(rec))
1721                 perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1722
1723         perf_header__clear_feat(&session->header, HEADER_STAT);
1724 }
1725
1726 static void
1727 record__finish_output(struct record *rec)
1728 {
1729         int i;
1730         struct perf_data *data = &rec->data;
1731         int fd = perf_data__fd(data);
1732
1733         if (data->is_pipe)
1734                 return;
1735
1736         rec->session->header.data_size += rec->bytes_written;
1737         data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1738         if (record__threads_enabled(rec)) {
1739                 for (i = 0; i < data->dir.nr; i++)
1740                         data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1741         }
1742
1743         if (!rec->no_buildid) {
1744                 process_buildids(rec);
1745
1746                 if (rec->buildid_all)
1747                         dsos__hit_all(rec->session);
1748         }
1749         perf_session__write_header(rec->session, rec->evlist, fd, true);
1750
1751         return;
1752 }
1753
1754 static int record__synthesize_workload(struct record *rec, bool tail)
1755 {
1756         int err;
1757         struct perf_thread_map *thread_map;
1758         bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1759
1760         if (rec->opts.tail_synthesize != tail)
1761                 return 0;
1762
1763         thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1764         if (thread_map == NULL)
1765                 return -1;
1766
1767         err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1768                                                  process_synthesized_event,
1769                                                  &rec->session->machines.host,
1770                                                  needs_mmap,
1771                                                  rec->opts.sample_address);
1772         perf_thread_map__put(thread_map);
1773         return err;
1774 }
1775
1776 static int write_finished_init(struct record *rec, bool tail)
1777 {
1778         if (rec->opts.tail_synthesize != tail)
1779                 return 0;
1780
1781         return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1782 }
1783
1784 static int record__synthesize(struct record *rec, bool tail);
1785
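/*
 * Rotate the output: finish the current perf.data, switch to a new
 * <path>.<timestamp> file and, unless called at exit, re-synthesize
 * tracking events so the new file is self-contained. At most
 * switch_output.num_files rotated files are kept; older ones are removed.
 * Used both for --switch-output rotation, e.g. (illustrative)
 * 'perf record --switch-output=1s -a', and for --timestamp-filename at exit.
 */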
1786 static int
1787 record__switch_output(struct record *rec, bool at_exit)
1788 {
1789         struct perf_data *data = &rec->data;
1790         int fd, err;
1791         char *new_filename;
1792
1793         /* Same size as a timestamp like "2015122520103046" */
1794         char timestamp[] = "InvalidTimestamp";
1795
1796         record__aio_mmap_read_sync(rec);
1797
1798         write_finished_init(rec, true);
1799
1800         record__synthesize(rec, true);
1801         if (target__none(&rec->opts.target))
1802                 record__synthesize_workload(rec, true);
1803
1804         rec->samples = 0;
1805         record__finish_output(rec);
1806         err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1807         if (err) {
1808                 pr_err("Failed to get current timestamp\n");
1809                 return -EINVAL;
1810         }
1811
1812         fd = perf_data__switch(data, timestamp,
1813                                     rec->session->header.data_offset,
1814                                     at_exit, &new_filename);
1815         if (fd >= 0 && !at_exit) {
1816                 rec->bytes_written = 0;
1817                 rec->session->header.data_size = 0;
1818         }
1819
1820         if (!quiet)
1821                 fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1822                         data->path, timestamp);
1823
1824         if (rec->switch_output.num_files) {
1825                 int n = rec->switch_output.cur_file + 1;
1826
1827                 if (n >= rec->switch_output.num_files)
1828                         n = 0;
1829                 rec->switch_output.cur_file = n;
1830                 if (rec->switch_output.filenames[n]) {
1831                         remove(rec->switch_output.filenames[n]);
1832                         zfree(&rec->switch_output.filenames[n]);
1833                 }
1834                 rec->switch_output.filenames[n] = new_filename;
1835         } else {
1836                 free(new_filename);
1837         }
1838
1839         /* Output tracking events */
1840         if (!at_exit) {
1841                 record__synthesize(rec, false);
1842
1843                 /*
1844                  * In 'perf record --switch-output' without -a,
1845                  * record__synthesize() in record__switch_output() won't
1846                  * generate tracking events because there is no thread_map
1847                  * in the evlist, so the newly created perf.data would lack
1848                  * map and comm information.
1849                  * Create a fake thread_map and directly call
1850                  * perf_event__synthesize_thread_map() for those events.
1851                  */
1852                 if (target__none(&rec->opts.target))
1853                         record__synthesize_workload(rec, false);
1854                 write_finished_init(rec, false);
1855         }
1856         return fd;
1857 }
1858
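/*
 * Read the kernel's lost-sample count for one (cpu, thread) instance of an
 * event and, when it is non-zero, emit a synthesized
 * PERF_RECORD_LOST_SAMPLES record (with an id sample appended if the event
 * has sample ids) so that later analysis can account for dropped samples.
 */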
1859 static void __record__read_lost_samples(struct record *rec, struct evsel *evsel,
1860                                         struct perf_record_lost_samples *lost,
1861                                         int cpu_idx, int thread_idx)
1862 {
1863         struct perf_counts_values count;
1864         struct perf_sample_id *sid;
1865         struct perf_sample sample = {};
1866         int id_hdr_size;
1867
1868         if (perf_evsel__read(&evsel->core, cpu_idx, thread_idx, &count) < 0) {
1869                 pr_debug("read LOST count failed\n");
1870                 return;
1871         }
1872
1873         if (count.lost == 0)
1874                 return;
1875
1876         lost->lost = count.lost;
1877         if (evsel->core.ids) {
1878                 sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1879                 sample.id = sid->id;
1880         }
1881
1882         id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1883                                                        evsel->core.attr.sample_type, &sample);
1884         lost->header.size = sizeof(*lost) + id_hdr_size;
1885         record__write(rec, NULL, lost, lost->header.size);
1886 }
1887
1888 static void record__read_lost_samples(struct record *rec)
1889 {
1890         struct perf_session *session = rec->session;
1891         struct perf_record_lost_samples *lost;
1892         struct evsel *evsel;
1893
1894         /* there was an error during record__open */
1895         if (session->evlist == NULL)
1896                 return;
1897
1898         lost = zalloc(PERF_SAMPLE_MAX_SIZE);
1899         if (lost == NULL) {
1900                 pr_debug("Memory allocation failed\n");
1901                 return;
1902         }
1903
1904         lost->header.type = PERF_RECORD_LOST_SAMPLES;
1905
1906         evlist__for_each_entry(session->evlist, evsel) {
1907                 struct xyarray *xy = evsel->core.sample_id;
1908
1909                 if (xy == NULL || evsel->core.fd == NULL)
1910                         continue;
1911                 if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
1912                     xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
1913                         pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
1914                         continue;
1915                 }
1916
1917                 for (int x = 0; x < xyarray__max_x(xy); x++) {
1918                         for (int y = 0; y < xyarray__max_y(xy); y++) {
1919                                 __record__read_lost_samples(rec, evsel, lost, x, y);
1920                         }
1921                 }
1922         }
1923         free(lost);
1924
1925 }
1926
1927 static volatile sig_atomic_t workload_exec_errno;
1928
1929 /*
1930  * evlist__prepare_workload() will send a SIGUSR1
1931  * if the fork fails, since we asked for it by setting
1932  * its want_signal parameter to true.
1933  */
1934 static void workload_exec_failed_signal(int signo __maybe_unused,
1935                                         siginfo_t *info,
1936                                         void *ucontext __maybe_unused)
1937 {
1938         workload_exec_errno = info->si_value.sival_int;
1939         done = 1;
1940         child_finished = 1;
1941 }
1942
1943 static void snapshot_sig_handler(int sig);
1944 static void alarm_sig_handler(int sig);
1945
1946 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1947 {
1948         if (evlist) {
1949                 if (evlist->mmap && evlist->mmap[0].core.base)
1950                         return evlist->mmap[0].core.base;
1951                 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1952                         return evlist->overwrite_mmap[0].core.base;
1953         }
1954         return NULL;
1955 }
1956
1957 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1958 {
1959         const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
1960         if (pc)
1961                 return pc;
1962         return NULL;
1963 }
1964
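/*
 * Synthesize the non-sample side of the trace: pipe header data when
 * writing to a pipe, time conversion info, the id index, auxtrace info,
 * kernel and module mmaps, guest machines, extra attributes, thread and CPU
 * maps, already-loaded BPF programs, cgroups, and finally the pre-existing
 * tasks of the target, optionally spread over nr_threads_synthesize
 * synthesis threads.
 */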
1965 static int record__synthesize(struct record *rec, bool tail)
1966 {
1967         struct perf_session *session = rec->session;
1968         struct machine *machine = &session->machines.host;
1969         struct perf_data *data = &rec->data;
1970         struct record_opts *opts = &rec->opts;
1971         struct perf_tool *tool = &rec->tool;
1972         int err = 0;
1973         event_op f = process_synthesized_event;
1974
1975         if (rec->opts.tail_synthesize != tail)
1976                 return 0;
1977
1978         if (data->is_pipe) {
1979                 err = perf_event__synthesize_for_pipe(tool, session, data,
1980                                                       process_synthesized_event);
1981                 if (err < 0)
1982                         goto out;
1983
1984                 rec->bytes_written += err;
1985         }
1986
1987         err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1988                                           process_synthesized_event, machine);
1989         if (err)
1990                 goto out;
1991
1992         /* Synthesize id_index before auxtrace_info */
1993         err = perf_event__synthesize_id_index(tool,
1994                                               process_synthesized_event,
1995                                               session->evlist, machine);
1996         if (err)
1997                 goto out;
1998
1999         if (rec->opts.full_auxtrace) {
2000                 err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
2001                                         session, process_synthesized_event);
2002                 if (err)
2003                         goto out;
2004         }
2005
2006         if (!evlist__exclude_kernel(rec->evlist)) {
2007                 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2008                                                          machine);
2009                 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2010                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2011                                    "Check /proc/kallsyms permission or run as root.\n");
2012
2013                 err = perf_event__synthesize_modules(tool, process_synthesized_event,
2014                                                      machine);
2015                 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2016                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2017                                    "Check /proc/modules permission or run as root.\n");
2018         }
2019
2020         if (perf_guest) {
2021                 machines__process_guests(&session->machines,
2022                                          perf_event__synthesize_guest_os, tool);
2023         }
2024
2025         err = perf_event__synthesize_extra_attr(&rec->tool,
2026                                                 rec->evlist,
2027                                                 process_synthesized_event,
2028                                                 data->is_pipe);
2029         if (err)
2030                 goto out;
2031
2032         err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2033                                                  process_synthesized_event,
2034                                                  NULL);
2035         if (err < 0) {
2036                 pr_err("Couldn't synthesize thread map.\n");
2037                 return err;
2038         }
2039
2040         err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2041                                              process_synthesized_event, NULL);
2042         if (err < 0) {
2043                 pr_err("Couldn't synthesize cpu map.\n");
2044                 return err;
2045         }
2046
2047         err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2048                                                 machine, opts);
2049         if (err < 0) {
2050                 pr_warning("Couldn't synthesize bpf events.\n");
2051                 err = 0;
2052         }
2053
2054         if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2055                 err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2056                                                      machine);
2057                 if (err < 0) {
2058                         pr_warning("Couldn't synthesize cgroup events.\n");
2059                         err = 0;
2060                 }
2061         }
2062
2063         if (rec->opts.nr_threads_synthesize > 1) {
2064                 mutex_init(&synth_lock);
2065                 perf_set_multithreaded();
2066                 f = process_locked_synthesized_event;
2067         }
2068
2069         if (rec->opts.synth & PERF_SYNTH_TASK) {
2070                 bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2071
2072                 err = __machine__synthesize_threads(machine, tool, &opts->target,
2073                                                     rec->evlist->core.threads,
2074                                                     f, needs_mmap, opts->sample_address,
2075                                                     rec->opts.nr_threads_synthesize);
2076         }
2077
2078         if (rec->opts.nr_threads_synthesize > 1) {
2079                 perf_set_singlethreaded();
2080                 mutex_destroy(&synth_lock);
2081         }
2082
2083 out:
2084         return err;
2085 }
2086
2087 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2088 {
2089         struct record *rec = data;
2090         pthread_kill(rec->thread_id, SIGUSR2);
2091         return 0;
2092 }
2093
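/*
 * Set up the side-band evlist: hook --switch-output-event events up to a
 * SIGUSR2 callback targeting the main thread and, when built with libbpf,
 * add the event used to track PERF_RECORD_BPF_EVENT, then start the
 * side-band reader thread.
 */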
2094 static int record__setup_sb_evlist(struct record *rec)
2095 {
2096         struct record_opts *opts = &rec->opts;
2097
2098         if (rec->sb_evlist != NULL) {
2099                 /*
2100                  * We get here if --switch-output-event populated the
2101                  * sb_evlist, so associate a callback that will send a SIGUSR2
2102                  * to the main thread.
2103                  */
2104                 evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2105                 rec->thread_id = pthread_self();
2106         }
2107 #ifdef HAVE_LIBBPF_SUPPORT
2108         if (!opts->no_bpf_event) {
2109                 if (rec->sb_evlist == NULL) {
2110                         rec->sb_evlist = evlist__new();
2111
2112                         if (rec->sb_evlist == NULL) {
2113                                 pr_err("Couldn't create side band evlist.\n");
2114                                 return -1;
2115                         }
2116                 }
2117
2118                 if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
2119                         pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n");
2120                         return -1;
2121                 }
2122         }
2123 #endif
2124         if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2125                 pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2126                 opts->no_bpf_event = true;
2127         }
2128
2129         return 0;
2130 }
2131
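/*
 * When -k/--clockid is used, store the clockid plus a pair of reference
 * readings, gettimeofday() and clock_gettime(clockid), both in nanoseconds,
 * in the header env so post-processing can relate perf timestamps to
 * wall-clock time (roughly tod_ns + (t - clockid_ns); this formula is an
 * illustration of intent, not a quote of the conversion code).
 */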
2132 static int record__init_clock(struct record *rec)
2133 {
2134         struct perf_session *session = rec->session;
2135         struct timespec ref_clockid;
2136         struct timeval ref_tod;
2137         u64 ref;
2138
2139         if (!rec->opts.use_clockid)
2140                 return 0;
2141
2142         if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2143                 session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
2144
2145         session->header.env.clock.clockid = rec->opts.clockid;
2146
2147         if (gettimeofday(&ref_tod, NULL) != 0) {
2148                 pr_err("gettimeofday failed, cannot set reference time.\n");
2149                 return -1;
2150         }
2151
2152         if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2153                 pr_err("clock_gettime failed, cannot set reference time.\n");
2154                 return -1;
2155         }
2156
2157         ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2158               (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2159
2160         session->header.env.clock.tod_ns = ref;
2161
2162         ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2163               (u64) ref_clockid.tv_nsec;
2164
2165         session->header.env.clock.clockid_ns = ref;
2166         return 0;
2167 }
2168
2169 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2170 {
2171         if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2172                 trigger_hit(&auxtrace_snapshot_trigger);
2173                 auxtrace_record__snapshot_started = 1;
2174                 if (auxtrace_record__snapshot_start(rec->itr))
2175                         trigger_error(&auxtrace_snapshot_trigger);
2176         }
2177 }
2178
2179 static void record__uniquify_name(struct record *rec)
2180 {
2181         struct evsel *pos;
2182         struct evlist *evlist = rec->evlist;
2183         char *new_name;
2184         int ret;
2185
2186         if (!perf_pmu__has_hybrid())
2187                 return;
2188
2189         evlist__for_each_entry(evlist, pos) {
2190                 if (!evsel__is_hybrid(pos))
2191                         continue;
2192
2193                 if (strchr(pos->name, '/'))
2194                         continue;
2195
2196                 ret = asprintf(&new_name, "%s/%s/",
2197                                pos->pmu_name, pos->name);
2198                 if (ret >= 0) {
2199                         free(pos->name);
2200                         pos->name = new_name;
2201                 }
2202         }
2203 }
2204
2205 static int record__terminate_thread(struct record_thread *thread_data)
2206 {
2207         int err;
2208         enum thread_msg ack = THREAD_MSG__UNDEFINED;
2209         pid_t tid = thread_data->tid;
2210
2211         close(thread_data->pipes.msg[1]);
2212         thread_data->pipes.msg[1] = -1;
2213         err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2214         if (err > 0)
2215                 pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2216         else
2217                 pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2218                            thread->tid, tid);
2219
2220         return 0;
2221 }
2222
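/*
 * Start the nr_threads - 1 worker threads in threaded mode: they are
 * created detached, with all signals blocked so only the main thread
 * handles them, optionally pinned to their affinity mask, and each must
 * ack its startup over its pipe before recording proceeds. thread_data[0]
 * remains bound to the main thread.
 */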
2223 static int record__start_threads(struct record *rec)
2224 {
2225         int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2226         struct record_thread *thread_data = rec->thread_data;
2227         sigset_t full, mask;
2228         pthread_t handle;
2229         pthread_attr_t attrs;
2230
2231         thread = &thread_data[0];
2232
2233         if (!record__threads_enabled(rec))
2234                 return 0;
2235
2236         sigfillset(&full);
2237         if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2238                 pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2239                 return -1;
2240         }
2241
2242         pthread_attr_init(&attrs);
2243         pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2244
2245         for (t = 1; t < nr_threads; t++) {
2246                 enum thread_msg msg = THREAD_MSG__UNDEFINED;
2247
2248 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2249                 pthread_attr_setaffinity_np(&attrs,
2250                                             MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2251                                             (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2252 #endif
2253                 if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2254                         for (tt = 1; tt < t; tt++)
2255                                 record__terminate_thread(&thread_data[tt]);
2256                         pr_err("Failed to start threads: %s\n", strerror(errno));
2257                         ret = -1;
2258                         goto out_err;
2259                 }
2260
2261                 err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2262                 if (err > 0)
2263                         pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2264                                   thread_msg_tags[msg]);
2265                 else
2266                         pr_warning("threads[%d]: failed to receive start notification from %d\n",
2267                                    thread->tid, rec->thread_data[t].tid);
2268         }
2269
2270         sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2271                         (cpu_set_t *)thread->mask->affinity.bits);
2272
2273         pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2274
2275 out_err:
2276         pthread_attr_destroy(&attrs);
2277
2278         if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2279                 pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2280                 ret = -1;
2281         }
2282
2283         return ret;
2284 }
2285
2286 static int record__stop_threads(struct record *rec)
2287 {
2288         int t;
2289         struct record_thread *thread_data = rec->thread_data;
2290
2291         for (t = 1; t < rec->nr_threads; t++)
2292                 record__terminate_thread(&thread_data[t]);
2293
2294         for (t = 0; t < rec->nr_threads; t++) {
2295                 rec->samples += thread_data[t].samples;
2296                 if (!record__threads_enabled(rec))
2297                         continue;
2298                 rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2299                 rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2300                 pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2301                          thread_data[t].samples, thread_data[t].waking);
2302                 if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2303                         pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2304                                  thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2305                 else
2306                         pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2307         }
2308
2309         return 0;
2310 }
2311
2312 static unsigned long record__waking(struct record *rec)
2313 {
2314         int t;
2315         unsigned long waking = 0;
2316         struct record_thread *thread_data = rec->thread_data;
2317
2318         for (t = 0; t < rec->nr_threads; t++)
2319                 waking += thread_data[t].waking;
2320
2321         return waking;
2322 }
2323
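/*
 * Main body of 'perf record': set up signal handlers and the session, fork
 * the workload, open and mmap the events, write the file header, synthesize
 * metadata, start worker threads, then loop draining the ring buffers
 * (handling control-fd commands, switch-output and auxtrace snapshots)
 * until done, and finally stop the threads, read lost samples, do the tail
 * synthesis and finish or rotate the output file.
 */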
2324 static int __cmd_record(struct record *rec, int argc, const char **argv)
2325 {
2326         int err;
2327         int status = 0;
2328         const bool forks = argc > 0;
2329         struct perf_tool *tool = &rec->tool;
2330         struct record_opts *opts = &rec->opts;
2331         struct perf_data *data = &rec->data;
2332         struct perf_session *session;
2333         bool disabled = false, draining = false;
2334         int fd;
2335         float ratio = 0;
2336         enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2337
2338         atexit(record__sig_exit);
2339         signal(SIGCHLD, sig_handler);
2340         signal(SIGINT, sig_handler);
2341         signal(SIGTERM, sig_handler);
2342         signal(SIGSEGV, sigsegv_handler);
2343
2344         if (rec->opts.record_namespaces)
2345                 tool->namespace_events = true;
2346
2347         if (rec->opts.record_cgroup) {
2348 #ifdef HAVE_FILE_HANDLE
2349                 tool->cgroup_events = true;
2350 #else
2351                 pr_err("cgroup tracking is not supported\n");
2352                 return -1;
2353 #endif
2354         }
2355
2356         if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2357                 signal(SIGUSR2, snapshot_sig_handler);
2358                 if (rec->opts.auxtrace_snapshot_mode)
2359                         trigger_on(&auxtrace_snapshot_trigger);
2360                 if (rec->switch_output.enabled)
2361                         trigger_on(&switch_output_trigger);
2362         } else {
2363                 signal(SIGUSR2, SIG_IGN);
2364         }
2365
2366         session = perf_session__new(data, tool);
2367         if (IS_ERR(session)) {
2368                 pr_err("Perf session creation failed.\n");
2369                 return PTR_ERR(session);
2370         }
2371
2372         if (record__threads_enabled(rec)) {
2373                 if (perf_data__is_pipe(&rec->data)) {
2374                         pr_err("Parallel trace streaming is not available in pipe mode.\n");
2375                         return -1;
2376                 }
2377                 if (rec->opts.full_auxtrace) {
2378                         pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2379                         return -1;
2380                 }
2381         }
2382
2383         fd = perf_data__fd(data);
2384         rec->session = session;
2385
2386         if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2387                 pr_err("Compression initialization failed.\n");
2388                 return -1;
2389         }
2390 #ifdef HAVE_EVENTFD_SUPPORT
2391         done_fd = eventfd(0, EFD_NONBLOCK);
2392         if (done_fd < 0) {
2393                 pr_err("Failed to create wakeup eventfd, error: %m\n");
2394                 status = -1;
2395                 goto out_delete_session;
2396         }
2397         err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2398         if (err < 0) {
2399                 pr_err("Failed to add wakeup eventfd to poll list\n");
2400                 status = err;
2401                 goto out_delete_session;
2402         }
2403 #endif // HAVE_EVENTFD_SUPPORT
2404
2405         session->header.env.comp_type  = PERF_COMP_ZSTD;
2406         session->header.env.comp_level = rec->opts.comp_level;
2407
2408         if (rec->opts.kcore &&
2409             !record__kcore_readable(&session->machines.host)) {
2410                 pr_err("ERROR: kcore is not readable.\n");
2411                 return -1;
2412         }
2413
2414         if (record__init_clock(rec))
2415                 return -1;
2416
2417         record__init_features(rec);
2418
2419         if (forks) {
2420                 err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2421                                                workload_exec_failed_signal);
2422                 if (err < 0) {
2423                         pr_err("Couldn't run the workload!\n");
2424                         status = err;
2425                         goto out_delete_session;
2426                 }
2427         }
2428
2429         /*
2430          * If we have just a single event and are sending data
2431          * through a pipe, we need to force sample id allocation,
2432          * because we synthesize the event name through the pipe
2433          * and need the id for that.
2434          */
2435         if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2436                 rec->opts.sample_id = true;
2437
2438         record__uniquify_name(rec);
2439
2440         /* Debug message used by test scripts */
2441         pr_debug3("perf record opening and mmapping events\n");
2442         if (record__open(rec) != 0) {
2443                 err = -1;
2444                 goto out_free_threads;
2445         }
2446         /* Debug message used by test scripts */
2447         pr_debug3("perf record done opening and mmapping events\n");
2448         session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
2449
2450         if (rec->opts.kcore) {
2451                 err = record__kcore_copy(&session->machines.host, data);
2452                 if (err) {
2453                         pr_err("ERROR: Failed to copy kcore\n");
2454                         goto out_free_threads;
2455                 }
2456         }
2457
2458         err = bpf__apply_obj_config();
2459         if (err) {
2460                 char errbuf[BUFSIZ];
2461
2462                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2463                 pr_err("ERROR: Apply config to BPF failed: %s\n",
2464                          errbuf);
2465                 goto out_free_threads;
2466         }
2467
2468         /*
2469          * Normally perf_session__new would do this, but it doesn't have the
2470          * evlist.
2471          */
2472         if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2473                 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2474                 rec->tool.ordered_events = false;
2475         }
2476
2477         if (evlist__nr_groups(rec->evlist) == 0)
2478                 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2479
2480         if (data->is_pipe) {
2481                 err = perf_header__write_pipe(fd);
2482                 if (err < 0)
2483                         goto out_free_threads;
2484         } else {
2485                 err = perf_session__write_header(session, rec->evlist, fd, false);
2486                 if (err < 0)
2487                         goto out_free_threads;
2488         }
2489
2490         err = -1;
2491         if (!rec->no_buildid
2492             && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2493                 pr_err("Couldn't generate buildids. "
2494                        "Use --no-buildid to profile anyway.\n");
2495                 goto out_free_threads;
2496         }
2497
2498         err = record__setup_sb_evlist(rec);
2499         if (err)
2500                 goto out_free_threads;
2501
2502         err = record__synthesize(rec, false);
2503         if (err < 0)
2504                 goto out_free_threads;
2505
2506         if (rec->realtime_prio) {
2507                 struct sched_param param;
2508
2509                 param.sched_priority = rec->realtime_prio;
2510                 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2511                         pr_err("Could not set realtime priority.\n");
2512                         err = -1;
2513                         goto out_free_threads;
2514                 }
2515         }
2516
2517         if (record__start_threads(rec))
2518                 goto out_free_threads;
2519
2520         /*
2521          * When perf is starting the traced process, all the events
2522          * (apart from group members) have enable_on_exec=1 set,
2523          * so don't spoil it by prematurely enabling them.
2524          */
2525         if (!target__none(&opts->target) && !opts->target.initial_delay)
2526                 evlist__enable(rec->evlist);
2527
2528         /*
2529          * Let the child rip
2530          */
2531         if (forks) {
2532                 struct machine *machine = &session->machines.host;
2533                 union perf_event *event;
2534                 pid_t tgid;
2535
2536                 event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2537                 if (event == NULL) {
2538                         err = -ENOMEM;
2539                         goto out_child;
2540                 }
2541
2542                 /*
2543                  * Some H/W events are generated before the COMM event,
2544                  * which is emitted during exec(), so perf script
2545                  * cannot see the correct process name for those events.
2546                  * Synthesize a COMM event to prevent that.
2547                  */
2548                 tgid = perf_event__synthesize_comm(tool, event,
2549                                                    rec->evlist->workload.pid,
2550                                                    process_synthesized_event,
2551                                                    machine);
2552                 free(event);
2553
2554                 if (tgid == -1)
2555                         goto out_child;
2556
2557                 event = malloc(sizeof(event->namespaces) +
2558                                (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2559                                machine->id_hdr_size);
2560                 if (event == NULL) {
2561                         err = -ENOMEM;
2562                         goto out_child;
2563                 }
2564
2565                 /*
2566                  * Synthesize NAMESPACES event for the command specified.
2567                  */
2568                 perf_event__synthesize_namespaces(tool, event,
2569                                                   rec->evlist->workload.pid,
2570                                                   tgid, process_synthesized_event,
2571                                                   machine);
2572                 free(event);
2573
2574                 evlist__start_workload(rec->evlist);
2575         }
2576
2577         if (opts->target.initial_delay) {
2578                 pr_info(EVLIST_DISABLED_MSG);
2579                 if (opts->target.initial_delay > 0) {
2580                         usleep(opts->target.initial_delay * USEC_PER_MSEC);
2581                         evlist__enable(rec->evlist);
2582                         pr_info(EVLIST_ENABLED_MSG);
2583                 }
2584         }
2585
2586         err = event_enable_timer__start(rec->evlist->eet);
2587         if (err)
2588                 goto out_child;
2589
2590         /* Debug message used by test scripts */
2591         pr_debug3("perf record has started\n");
2592         fflush(stderr);
2593
2594         trigger_ready(&auxtrace_snapshot_trigger);
2595         trigger_ready(&switch_output_trigger);
2596         perf_hooks__invoke_record_start();
2597
2598         /*
2599          * Must write FINISHED_INIT so it will be seen after all other
2600          * synthesized user events, but before any regular events.
2601          */
2602         err = write_finished_init(rec, false);
2603         if (err < 0)
2604                 goto out_child;
2605
2606         for (;;) {
2607                 unsigned long long hits = thread->samples;
2608
2609                 /*
2610                  * rec->evlist->bkw_mmap_state may be BKW_MMAP_EMPTY
2611                  * here: when done == true and hits != rec->samples
2612                  * in the previous round.
2613                  *
2614                  * evlist__toggle_bkw_mmap() ensures we never convert
2615                  * BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2616                  */
2617                 if (trigger_is_hit(&switch_output_trigger) || done || draining)
2618                         evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2619
2620                 if (record__mmap_read_all(rec, false) < 0) {
2621                         trigger_error(&auxtrace_snapshot_trigger);
2622                         trigger_error(&switch_output_trigger);
2623                         err = -1;
2624                         goto out_child;
2625                 }
2626
2627                 if (auxtrace_record__snapshot_started) {
2628                         auxtrace_record__snapshot_started = 0;
2629                         if (!trigger_is_error(&auxtrace_snapshot_trigger))
2630                                 record__read_auxtrace_snapshot(rec, false);
2631                         if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2632                                 pr_err("AUX area tracing snapshot failed\n");
2633                                 err = -1;
2634                                 goto out_child;
2635                         }
2636                 }
2637
2638                 if (trigger_is_hit(&switch_output_trigger)) {
2639                         /*
2640                          * If switch_output_trigger is hit, the data in the
2641                          * overwritable ring buffer should have been collected,
2642                          * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2643                          *
2644                          * If SIGUSR2 was raised after or during
2645                          * record__mmap_read_all(), it didn't collect data from
2646                          * the overwritable ring buffer. Read again.
2647                          */
2648                         if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2649                                 continue;
2650                         trigger_ready(&switch_output_trigger);
2651
2652                         /*
2653                          * Re-enable events in the overwrite ring buffer after
2654                          * record__mmap_read_all(): we should have collected
2655                          * its data by now.
2656                          */
2657                         evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2658
2659                         if (!quiet)
2660                                 fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2661                                         record__waking(rec));
2662                         thread->waking = 0;
2663                         fd = record__switch_output(rec, false);
2664                         if (fd < 0) {
2665                                 pr_err("Failed to switch to new file\n");
2666                                 trigger_error(&switch_output_trigger);
2667                                 err = fd;
2668                                 goto out_child;
2669                         }
2670
2671                         /* re-arm the alarm */
2672                         if (rec->switch_output.time)
2673                                 alarm(rec->switch_output.time);
2674                 }
2675
2676                 if (hits == thread->samples) {
2677                         if (done || draining)
2678                                 break;
2679                         err = fdarray__poll(&thread->pollfd, -1);
2680                         /*
2681                          * Propagate the error only if there is one: a positive
2682                          * number of returned events and EINTR are ignored.
2683                          */
2684                         if (err > 0 || (err < 0 && errno == EINTR))
2685                                 err = 0;
2686                         thread->waking++;
2687
2688                         if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2689                                             record__thread_munmap_filtered, NULL) == 0)
2690                                 draining = true;
2691
2692                         err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2693                         if (err)
2694                                 goto out_child;
2695                 }
2696
2697                 if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2698                         switch (cmd) {
2699                         case EVLIST_CTL_CMD_SNAPSHOT:
2700                                 hit_auxtrace_snapshot_trigger(rec);
2701                                 evlist__ctlfd_ack(rec->evlist);
2702                                 break;
2703                         case EVLIST_CTL_CMD_STOP:
2704                                 done = 1;
2705                                 break;
2706                         case EVLIST_CTL_CMD_ACK:
2707                         case EVLIST_CTL_CMD_UNSUPPORTED:
2708                         case EVLIST_CTL_CMD_ENABLE:
2709                         case EVLIST_CTL_CMD_DISABLE:
2710                         case EVLIST_CTL_CMD_EVLIST:
2711                         case EVLIST_CTL_CMD_PING:
2712                         default:
2713                                 break;
2714                         }
2715                 }
2716
2717                 err = event_enable_timer__process(rec->evlist->eet);
2718                 if (err < 0)
2719                         goto out_child;
2720                 if (err) {
2721                         err = 0;
2722                         done = 1;
2723                 }
2724
2725                 /*
2726                  * When perf is starting the traced process, the events die
2727                  * with the process at the end and we wait for that. Thus
2728                  * there is no need to disable the events in this case.
2729                  */
2730                 if (done && !disabled && !target__none(&opts->target)) {
2731                         trigger_off(&auxtrace_snapshot_trigger);
2732                         evlist__disable(rec->evlist);
2733                         disabled = true;
2734                 }
2735         }
2736
2737         trigger_off(&auxtrace_snapshot_trigger);
2738         trigger_off(&switch_output_trigger);
2739
2740         if (opts->auxtrace_snapshot_on_exit)
2741                 record__auxtrace_snapshot_exit(rec);
2742
2743         if (forks && workload_exec_errno) {
2744                 char msg[STRERR_BUFSIZE], strevsels[2048];
2745                 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2746
2747                 evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2748
2749                 pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2750                         strevsels, argv[0], emsg);
2751                 err = -1;
2752                 goto out_child;
2753         }
2754
2755         if (!quiet)
2756                 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2757                         record__waking(rec));
2758
2759         write_finished_init(rec, true);
2760
2761         if (target__none(&rec->opts.target))
2762                 record__synthesize_workload(rec, true);
2763
2764 out_child:
2765         record__stop_threads(rec);
2766         record__mmap_read_all(rec, true);
2767 out_free_threads:
2768         record__free_thread_data(rec);
2769         evlist__finalize_ctlfd(rec->evlist);
2770         record__aio_mmap_read_sync(rec);
2771
2772         if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2773                 ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2774                 session->header.env.comp_ratio = ratio + 0.5;
2775         }
2776
2777         if (forks) {
2778                 int exit_status;
2779
2780                 if (!child_finished)
2781                         kill(rec->evlist->workload.pid, SIGTERM);
2782
2783                 wait(&exit_status);
2784
2785                 if (err < 0)
2786                         status = err;
2787                 else if (WIFEXITED(exit_status))
2788                         status = WEXITSTATUS(exit_status);
2789                 else if (WIFSIGNALED(exit_status))
2790                         signr = WTERMSIG(exit_status);
2791         } else
2792                 status = err;
2793
2794         if (rec->off_cpu)
2795                 rec->bytes_written += off_cpu_write(rec->session);
2796
2797         record__read_lost_samples(rec);
2798         record__synthesize(rec, true);
2799         /* this will be recalculated during process_buildids() */
2800         rec->samples = 0;
2801
2802         if (!err) {
2803                 if (!rec->timestamp_filename) {
2804                         record__finish_output(rec);
2805                 } else {
2806                         fd = record__switch_output(rec, true);
2807                         if (fd < 0) {
2808                                 status = fd;
2809                                 goto out_delete_session;
2810                         }
2811                 }
2812         }
2813
2814         perf_hooks__invoke_record_end();
2815
2816         if (!err && !quiet) {
2817                 char samples[128];
2818                 const char *postfix = rec->timestamp_filename ?
2819                                         ".<timestamp>" : "";
2820
2821                 if (rec->samples && !rec->opts.full_auxtrace)
2822                         scnprintf(samples, sizeof(samples),
2823                                   " (%" PRIu64 " samples)", rec->samples);
2824                 else
2825                         samples[0] = '\0';
2826
2827                 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
2828                         perf_data__size(data) / 1024.0 / 1024.0,
2829                         data->path, postfix, samples);
2830                 if (ratio) {
2831                         fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
2832                                         rec->session->bytes_transferred / 1024.0 / 1024.0,
2833                                         ratio);
2834                 }
2835                 fprintf(stderr, " ]\n");
2836         }
2837
2838 out_delete_session:
2839 #ifdef HAVE_EVENTFD_SUPPORT
2840         if (done_fd >= 0) {
2841                 fd = done_fd;
2842                 done_fd = -1;
2843
2844                 close(fd);
2845         }
2846 #endif
2847         zstd_fini(&session->zstd_data);
2848         perf_session__delete(session);
2849
2850         if (!opts->no_bpf_event)
2851                 evlist__stop_sb_thread(rec->sb_evlist);
2852         return status;
2853 }
2854
2855 static void callchain_debug(struct callchain_param *callchain)
2856 {
2857         static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2858
2859         pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2860
2861         if (callchain->record_mode == CALLCHAIN_DWARF)
2862                 pr_debug("callchain: stack dump size %d\n",
2863                          callchain->dump_size);
2864 }
2865
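/*
 * Parse the --call-graph argument ("fp", "dwarf[,size]" or "lbr"); DWARF
 * unwinding additionally enables data address sampling. An illustrative
 * invocation: 'perf record --call-graph dwarf,8192 -- ls'.
 */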
2866 int record_opts__parse_callchain(struct record_opts *record,
2867                                  struct callchain_param *callchain,
2868                                  const char *arg, bool unset)
2869 {
2870         int ret;
2871         callchain->enabled = !unset;
2872
2873         /* --no-call-graph */
2874         if (unset) {
2875                 callchain->record_mode = CALLCHAIN_NONE;
2876                 pr_debug("callchain: disabled\n");
2877                 return 0;
2878         }
2879
2880         ret = parse_callchain_record_opt(arg, callchain);
2881         if (!ret) {
2882                 /* Enable data address sampling for DWARF unwind. */
2883                 if (callchain->record_mode == CALLCHAIN_DWARF)
2884                         record->sample_address = true;
2885                 callchain_debug(callchain);
2886         }
2887
2888         return ret;
2889 }
2890
2891 int record_parse_callchain_opt(const struct option *opt,
2892                                const char *arg,
2893                                int unset)
2894 {
2895         return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2896 }
2897
2898 int record_callchain_opt(const struct option *opt,
2899                          const char *arg __maybe_unused,
2900                          int unset __maybe_unused)
2901 {
2902         struct callchain_param *callchain = opt->value;
2903
2904         callchain->enabled = true;
2905
2906         if (callchain->record_mode == CALLCHAIN_NONE)
2907                 callchain->record_mode = CALLCHAIN_FP;
2908
2909         callchain_debug(callchain);
2910         return 0;
2911 }
2912
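/*
 * Example ~/.perfconfig snippet matching the keys handled below; the values
 * and the debuginfod URL are illustrative assumptions, not defaults:
 *
 *   [record]
 *       build-id = cache          # or no-cache, skip, mmap
 *       call-graph = dwarf        # forwarded as call-graph.record-mode
 *       aio = 1                   # only with HAVE_AIO_SUPPORT
 *       debuginfod = https://debuginfod.example.org
 *
 * Keys this callback does not recognize are ignored here (it just returns 0).
 */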
2913 static int perf_record_config(const char *var, const char *value, void *cb)
2914 {
2915         struct record *rec = cb;
2916
2917         if (!strcmp(var, "record.build-id")) {
2918                 if (!strcmp(value, "cache"))
2919                         rec->no_buildid_cache = false;
2920                 else if (!strcmp(value, "no-cache"))
2921                         rec->no_buildid_cache = true;
2922                 else if (!strcmp(value, "skip"))
2923                         rec->no_buildid = true;
2924                 else if (!strcmp(value, "mmap"))
2925                         rec->buildid_mmap = true;
2926                 else
2927                         return -1;
2928                 return 0;
2929         }
2930         if (!strcmp(var, "record.call-graph")) {
2931                 var = "call-graph.record-mode";
2932                 return perf_default_config(var, value, cb);
2933         }
2934 #ifdef HAVE_AIO_SUPPORT
2935         if (!strcmp(var, "record.aio")) {
2936                 rec->opts.nr_cblocks = strtol(value, NULL, 0);
2937                 if (!rec->opts.nr_cblocks)
2938                         rec->opts.nr_cblocks = nr_cblocks_default;
2939         }
2940 #endif
2941         if (!strcmp(var, "record.debuginfod")) {
2942                 rec->debuginfod.urls = strdup(value);
2943                 if (!rec->debuginfod.urls)
2944                         return -ENOMEM;
2945                 rec->debuginfod.set = true;
2946         }
2947
2948         return 0;
2949 }
2950
2951 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
2952 {
2953         struct record *rec = (struct record *)opt->value;
2954
2955         return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
2956 }
2957
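/*
 * Example (illustrative): "--affinity=node" pins the trace reading thread to
 * the NUMA node CPU mask of the mmap buffer being processed, "--affinity=cpu"
 * to that buffer's CPU; any other value leaves the default PERF_AFFINITY_SYS
 * set in cmd_record() untouched.
 */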
2958 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2959 {
2960         struct record_opts *opts = (struct record_opts *)opt->value;
2961
2962         if (unset || !str)
2963                 return 0;
2964
2965         if (!strcasecmp(str, "node"))
2966                 opts->affinity = PERF_AFFINITY_NODE;
2967         else if (!strcasecmp(str, "cpu"))
2968                 opts->affinity = PERF_AFFINITY_CPU;
2969
2970         return 0;
2971 }
2972
2973 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
2974 {
2975         mask->nbits = nr_bits;
2976         mask->bits = bitmap_zalloc(mask->nbits);
2977         if (!mask->bits)
2978                 return -ENOMEM;
2979
2980         return 0;
2981 }
2982
2983 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
2984 {
2985         bitmap_free(mask->bits);
2986         mask->nbits = 0;
2987 }
2988
2989 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
2990 {
2991         int ret;
2992
2993         ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
2994         if (ret) {
2995                 mask->affinity.bits = NULL;
2996                 return ret;
2997         }
2998
2999         ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
3000         if (ret) {
3001                 record__mmap_cpu_mask_free(&mask->maps);
3002                 mask->maps.bits = NULL;
3003         }
3004
3005         return ret;
3006 }
3007
3008 static void record__thread_mask_free(struct thread_mask *mask)
3009 {
3010         record__mmap_cpu_mask_free(&mask->maps);
3011         record__mmap_cpu_mask_free(&mask->affinity);
3012 }
3013
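/*
 * Illustrative --threads values handled here, inferred from the
 * THREAD_SPEC__* cases in record__init_thread_masks() below:
 *
 *   --threads            one data streaming thread per monitored CPU
 *   --threads=core       one thread per core
 *   --threads=package    one thread per package (socket)
 *   --threads=numa       one thread per NUMA node
 *   --threads=<spec>     user-defined masks, see record__init_thread_user_masks()
 */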
3014 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3015 {
3016         int s;
3017         struct record_opts *opts = opt->value;
3018
3019         if (unset || !str || !strlen(str)) {
3020                 opts->threads_spec = THREAD_SPEC__CPU;
3021         } else {
3022                 for (s = 1; s < THREAD_SPEC__MAX; s++) {
3023                         if (s == THREAD_SPEC__USER) {
3024                                 opts->threads_user_spec = strdup(str);
3025                                 if (!opts->threads_user_spec)
3026                                         return -ENOMEM;
3027                                 opts->threads_spec = THREAD_SPEC__USER;
3028                                 break;
3029                         }
3030                         if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3031                                 opts->threads_spec = s;
3032                                 break;
3033                         }
3034                 }
3035         }
3036
3037         if (opts->threads_spec == THREAD_SPEC__USER)
3038                 pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3039         else
3040                 pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3041
3042         return 0;
3043 }
3044
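/*
 * Example (illustrative): "--max-size=200M" caps the output file at
 * 200 * 2^20 bytes; the accepted B/K/M/G suffixes are the ones listed in
 * tags_size[] below.
 */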
3045 static int parse_output_max_size(const struct option *opt,
3046                                  const char *str, int unset)
3047 {
3048         unsigned long *s = (unsigned long *)opt->value;
3049         static struct parse_tag tags_size[] = {
3050                 { .tag  = 'B', .mult = 1       },
3051                 { .tag  = 'K', .mult = 1 << 10 },
3052                 { .tag  = 'M', .mult = 1 << 20 },
3053                 { .tag  = 'G', .mult = 1 << 30 },
3054                 { .tag  = 0 },
3055         };
3056         unsigned long val;
3057
3058         if (unset) {
3059                 *s = 0;
3060                 return 0;
3061         }
3062
3063         val = parse_tag_value(str, tags_size);
3064         if (val != (unsigned long) -1) {
3065                 *s = val;
3066                 return 0;
3067         }
3068
3069         return -1;
3070 }
3071
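/*
 * Illustrative forms of the "pages[,pages]" argument parsed below:
 *
 *   -m 512       512 data mmap pages
 *   -m 512,64    512 data mmap pages plus 64 AUX area tracing mmap pages
 *
 * Each part is validated by __evlist__parse_mmap_pages().
 */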
3072 static int record__parse_mmap_pages(const struct option *opt,
3073                                     const char *str,
3074                                     int unset __maybe_unused)
3075 {
3076         struct record_opts *opts = opt->value;
3077         char *s, *p;
3078         unsigned int mmap_pages;
3079         int ret;
3080
3081         if (!str)
3082                 return -EINVAL;
3083
3084         s = strdup(str);
3085         if (!s)
3086                 return -ENOMEM;
3087
3088         p = strchr(s, ',');
3089         if (p)
3090                 *p = '\0';
3091
3092         if (*s) {
3093                 ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3094                 if (ret)
3095                         goto out_free;
3096                 opts->mmap_pages = mmap_pages;
3097         }
3098
3099         if (!p) {
3100                 ret = 0;
3101                 goto out_free;
3102         }
3103
3104         ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3105         if (ret)
3106                 goto out_free;
3107
3108         opts->auxtrace_mmap_pages = mmap_pages;
3109
3110 out_free:
3111         free(s);
3112         return ret;
3113 }
3114
3115 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3116 {
3117 }
3118
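/*
 * Example control specs (illustrative, file names are placeholders), matching
 * the --control help text further below: "fd:10,11" reads commands from
 * descriptor 10 and sends acks on 11, while "fifo:ctl.fifo,ack.fifo" opens
 * the named FIFOs instead.
 */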
3119 static int parse_control_option(const struct option *opt,
3120                                 const char *str,
3121                                 int unset __maybe_unused)
3122 {
3123         struct record_opts *opts = opt->value;
3124
3125         return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3126 }
3127
3128 static void switch_output_size_warn(struct record *rec)
3129 {
3130         u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3131         struct switch_output *s = &rec->switch_output;
3132
3133         wakeup_size /= 2;
3134
3135         if (s->size < wakeup_size) {
3136                 char buf[100];
3137
3138                 unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3139                 pr_warning("WARNING: switch-output data size is lower than the "
3140                            "wakeup kernel buffer size (%s), "
3141                            "expect bigger perf.data sizes\n", buf);
3142         }
3143 }
3144
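/*
 * Illustrative --switch-output arguments handled below:
 *
 *   --switch-output (or =signal)   rotate the output on SIGUSR2
 *   --switch-output=100M           rotate after roughly 100 MiB of data
 *   --switch-output=30s            rotate every 30 seconds
 *
 * Any of these also turns on rec->timestamp_filename (see the "enabled"
 * label), so each rotated file gets its own timestamp suffix.
 */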
3145 static int switch_output_setup(struct record *rec)
3146 {
3147         struct switch_output *s = &rec->switch_output;
3148         static struct parse_tag tags_size[] = {
3149                 { .tag  = 'B', .mult = 1       },
3150                 { .tag  = 'K', .mult = 1 << 10 },
3151                 { .tag  = 'M', .mult = 1 << 20 },
3152                 { .tag  = 'G', .mult = 1 << 30 },
3153                 { .tag  = 0 },
3154         };
3155         static struct parse_tag tags_time[] = {
3156                 { .tag  = 's', .mult = 1        },
3157                 { .tag  = 'm', .mult = 60       },
3158                 { .tag  = 'h', .mult = 60*60    },
3159                 { .tag  = 'd', .mult = 60*60*24 },
3160                 { .tag  = 0 },
3161         };
3162         unsigned long val;
3163
3164         /*
3165          * If we're using --switch-output-events, then we imply
3166          * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3167          * thread to its parent.
3168          */
3169         if (rec->switch_output_event_set) {
3170                 if (record__threads_enabled(rec)) {
3171                         pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3172                         return 0;
3173                 }
3174                 goto do_signal;
3175         }
3176
3177         if (!s->set)
3178                 return 0;
3179
3180         if (record__threads_enabled(rec)) {
3181                 pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3182                 return 0;
3183         }
3184
3185         if (!strcmp(s->str, "signal")) {
3186 do_signal:
3187                 s->signal = true;
3188                 pr_debug("switch-output with SIGUSR2 signal\n");
3189                 goto enabled;
3190         }
3191
3192         val = parse_tag_value(s->str, tags_size);
3193         if (val != (unsigned long) -1) {
3194                 s->size = val;
3195                 pr_debug("switch-output with %s size threshold\n", s->str);
3196                 goto enabled;
3197         }
3198
3199         val = parse_tag_value(s->str, tags_time);
3200         if (val != (unsigned long) -1) {
3201                 s->time = val;
3202                 pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3203                          s->str, s->time);
3204                 goto enabled;
3205         }
3206
3207         return -1;
3208
3209 enabled:
3210         rec->timestamp_filename = true;
3211         s->enabled              = true;
3212
3213         if (s->size && !rec->opts.no_buffering)
3214                 switch_output_size_warn(rec);
3215
3216         return 0;
3217 }
3218
3219 static const char * const __record_usage[] = {
3220         "perf record [<options>] [<command>]",
3221         "perf record [<options>] -- <command> [<options>]",
3222         NULL
3223 };
3224 const char * const *record_usage = __record_usage;
3225
3226 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
3227                                   struct perf_sample *sample, struct machine *machine)
3228 {
3229         /*
3230          * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3231          * so there is no need to add them twice.
3232          */
3233         if (!(event->header.misc & PERF_RECORD_MISC_USER))
3234                 return 0;
3235         return perf_event__process_mmap(tool, event, sample, machine);
3236 }
3237
3238 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
3239                                    struct perf_sample *sample, struct machine *machine)
3240 {
3241         /*
3242          * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3243          * so there is no need to add them twice.
3244          */
3245         if (!(event->header.misc & PERF_RECORD_MISC_USER))
3246                 return 0;
3247
3248         return perf_event__process_mmap2(tool, event, sample, machine);
3249 }
3250
3251 static int process_timestamp_boundary(struct perf_tool *tool,
3252                                       union perf_event *event __maybe_unused,
3253                                       struct perf_sample *sample,
3254                                       struct machine *machine __maybe_unused)
3255 {
3256         struct record *rec = container_of(tool, struct record, tool);
3257
3258         set_timestamp_boundary(rec, sample->time);
3259         return 0;
3260 }
3261
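/*
 * Example values (illustrative, taken from the option description in
 * __record_options[]): "--synth=no", "--synth=all" (the default) or a subset
 * such as "--synth=task". The string is decoded by parse_synth_opt() and
 * anything it rejects fails the option.
 */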
3262 static int parse_record_synth_option(const struct option *opt,
3263                                      const char *str,
3264                                      int unset __maybe_unused)
3265 {
3266         struct record_opts *opts = opt->value;
3267         char *p = strdup(str);
3268
3269         if (p == NULL)
3270                 return -1;
3271
3272         opts->synth = parse_synth_opt(p);
3273         free(p);
3274
3275         if (opts->synth < 0) {
3276                 pr_err("Invalid synth option: %s\n", str);
3277                 return -1;
3278         }
3279         return 0;
3280 }
3281
3282 /*
3283  * XXX Ideally this would be local to cmd_record() and passed to a record__new(),
3284  * because we need access to it in record__exit(), which is called
3285  * after cmd_record() exits, but since record_options needs to be accessible to
3286  * builtin-script, leave it here.
3287  *
3288  * At least we don't touch it in all the other functions here directly.
3289  *
3290  * Just say no to tons of global variables, sigh.
3291  */
3292 static struct record record = {
3293         .opts = {
3294                 .sample_time         = true,
3295                 .mmap_pages          = UINT_MAX,
3296                 .user_freq           = UINT_MAX,
3297                 .user_interval       = ULLONG_MAX,
3298                 .freq                = 4000,
3299                 .target              = {
3300                         .uses_mmap   = true,
3301                         .default_per_cpu = true,
3302                 },
3303                 .mmap_flush          = MMAP_FLUSH_DEFAULT,
3304                 .nr_threads_synthesize = 1,
3305                 .ctl_fd              = -1,
3306                 .ctl_fd_ack          = -1,
3307                 .synth               = PERF_SYNTH_ALL,
3308         },
3309         .tool = {
3310                 .sample         = process_sample_event,
3311                 .fork           = perf_event__process_fork,
3312                 .exit           = perf_event__process_exit,
3313                 .comm           = perf_event__process_comm,
3314                 .namespaces     = perf_event__process_namespaces,
3315                 .mmap           = build_id__process_mmap,
3316                 .mmap2          = build_id__process_mmap2,
3317                 .itrace_start   = process_timestamp_boundary,
3318                 .aux            = process_timestamp_boundary,
3319                 .ordered_events = true,
3320         },
3321 };
3322
3323 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3324         "\n\t\t\t\tDefault: fp";
3325
3326 static bool dry_run;
3327
3328 /*
3329  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3330  * with it and switch to using the library functions in perf_evlist that came
3331  * from builtin-record.c, i.e. use record_opts,
3332  * evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record',
3333  * using pipes, etc.
3334  */
3335 static struct option __record_options[] = {
3336         OPT_CALLBACK('e', "event", &record.evlist, "event",
3337                      "event selector. use 'perf list' to list available events",
3338                      parse_events_option),
3339         OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3340                      "event filter", parse_filter),
3341         OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3342                            NULL, "don't record events from perf itself",
3343                            exclude_perf),
3344         OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3345                     "record events on existing process id"),
3346         OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3347                     "record events on existing thread id"),
3348         OPT_INTEGER('r', "realtime", &record.realtime_prio,
3349                     "collect data with this RT SCHED_FIFO priority"),
3350         OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3351                     "collect data without buffering"),
3352         OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3353                     "collect raw sample records from all opened counters"),
3354         OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3355                             "system-wide collection from all CPUs"),
3356         OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3357                     "list of cpus to monitor"),
3358         OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3359         OPT_STRING('o', "output", &record.data.path, "file",
3360                     "output file name"),
3361         OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3362                         &record.opts.no_inherit_set,
3363                         "child tasks do not inherit counters"),
3364         OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3365                     "synthesize non-sample events at the end of output"),
3366         OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3367         OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3368         OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3369                     "Fail if the specified frequency can't be used"),
3370         OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3371                      "profile at this frequency",
3372                       record__parse_freq),
3373         OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3374                      "number of mmap data pages and AUX area tracing mmap pages",
3375                      record__parse_mmap_pages),
3376         OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3377                      "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3378                      record__mmap_flush_parse),
3379         OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3380                            NULL, "enables call-graph recording" ,
3381                            &record_callchain_opt),
3382         OPT_CALLBACK(0, "call-graph", &record.opts,
3383                      "record_mode[,record_size]", record_callchain_help,
3384                      &record_parse_callchain_opt),
3385         OPT_INCR('v', "verbose", &verbose,
3386                     "be more verbose (show counter open errors, etc)"),
3387         OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3388         OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3389                     "per thread counts"),
3390         OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3391         OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3392                     "Record the sample physical addresses"),
3393         OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3394                     "Record the sampled data address data page size"),
3395         OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3396                     "Record the sampled code address (ip) page size"),
3397         OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3398         OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3399                     "Record the sample identifier"),
3400         OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3401                         &record.opts.sample_time_set,
3402                         "Record the sample timestamps"),
3403         OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3404                         "Record the sample period"),
3405         OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3406                     "don't sample"),
3407         OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3408                         &record.no_buildid_cache_set,
3409                         "do not update the buildid cache"),
3410         OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3411                         &record.no_buildid_set,
3412                         "do not collect buildids in perf.data"),
3413         OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3414                      "monitor event in cgroup name only",
3415                      parse_cgroups),
3416         OPT_CALLBACK('D', "delay", &record, "ms",
3417                      "ms to wait before starting measurement after program start (-1: start with events disabled), "
3418                      "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3419                      record__parse_event_enable_time),
3420         OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3421         OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
3422                    "user to profile"),
3423
3424         OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3425                      "branch any", "sample any taken branches",
3426                      parse_branch_stack),
3427
3428         OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3429                      "branch filter mask", "branch stack filter modes",
3430                      parse_branch_stack),
3431         OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3432                     "sample by weight (on special events only)"),
3433         OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3434                     "sample transaction flags (special events only)"),
3435         OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3436                     "use per-thread mmaps"),
3437         OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3438                     "sample selected machine registers on interrupt,"
3439                     " use '-I?' to list register names", parse_intr_regs),
3440         OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3441                     "sample selected machine registers on interrupt,"
3442                     " use '--user-regs=?' to list register names", parse_user_regs),
3443         OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3444                     "Record running/enabled time of read (:S) events"),
3445         OPT_CALLBACK('k', "clockid", &record.opts,
3446         "clockid", "clockid to use for events, see clock_gettime()",
3447         parse_clockid),
3448         OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3449                           "opts", "AUX area tracing Snapshot Mode", ""),
3450         OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3451                           "opts", "sample AUX area", ""),
3452         OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3453                         "per thread proc mmap processing timeout in ms"),
3454         OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3455                     "Record namespaces events"),
3456         OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3457                     "Record cgroup events"),
3458         OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3459                         &record.opts.record_switch_events_set,
3460                         "Record context switch events"),
3461         OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3462                          "Configure all used events to run in kernel space.",
3463                          PARSE_OPT_EXCLUSIVE),
3464         OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3465                          "Configure all used events to run in user space.",
3466                          PARSE_OPT_EXCLUSIVE),
3467         OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3468                     "collect kernel callchains"),
3469         OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3470                     "collect user callchains"),
3471         OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
3472                    "clang binary to use for compiling BPF scriptlets"),
3473         OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
3474                    "options passed to clang when compiling BPF scriptlets"),
3475         OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3476                    "file", "vmlinux pathname"),
3477         OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3478                     "Record build-id of all DSOs regardless of hits"),
3479         OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3480                     "Record build-id in map events"),
3481         OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3482                     "append timestamp to output filename"),
3483         OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3484                     "Record timestamp boundary (time of first/last samples)"),
3485         OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3486                           &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3487                           "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3488                           "signal"),
3489         OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event",
3490                          "switch output event selector. use 'perf list' to list available events",
3491                          parse_events_option_new_evlist),
3492         OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3493                    "Limit number of switch output generated files"),
3494         OPT_BOOLEAN(0, "dry-run", &dry_run,
3495                     "Parse options then exit"),
3496 #ifdef HAVE_AIO_SUPPORT
3497         OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3498                      &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3499                      record__aio_parse),
3500 #endif
3501         OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3502                      "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3503                      record__parse_affinity),
3504 #ifdef HAVE_ZSTD_SUPPORT
3505         OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3506                             "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3507                             record__parse_comp_level),
3508 #endif
3509         OPT_CALLBACK(0, "max-size", &record.output_max_size,
3510                      "size", "Limit the maximum size of the output file", parse_output_max_size),
3511         OPT_UINTEGER(0, "num-thread-synthesize",
3512                      &record.opts.nr_threads_synthesize,
3513                      "number of threads to run for event synthesis"),
3514 #ifdef HAVE_LIBPFM
3515         OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3516                 "libpfm4 event selector. use 'perf list' to list available events",
3517                 parse_libpfm_events_option),
3518 #endif
3519         OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3520                      "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3521                      "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
3522                      "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3523                      "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3524                       parse_control_option),
3525         OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3526                      "Fine-tune event synthesis: default=all", parse_record_synth_option),
3527         OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3528                           &record.debuginfod.set, "debuginfod urls",
3529                           "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3530                           "system"),
3531         OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3532                             "write collected trace data into several data files using parallel threads",
3533                             record__parse_threads),
3534         OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3535         OPT_END()
3536 };
3537
3538 struct option *record_options = __record_options;
3539
3540 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3541 {
3542         struct perf_cpu cpu;
3543         int idx;
3544
3545         if (cpu_map__is_dummy(cpus))
3546                 return 0;
3547
3548         perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
3549                 if (cpu.cpu == -1)
3550                         continue;
3551                 /* Return -ENODEV if the input cpu is greater than max cpu */
3552                 if ((unsigned long)cpu.cpu > mask->nbits)
3553                         return -ENODEV;
3554                 __set_bit(cpu.cpu, mask->bits);
3555         }
3556
3557         return 0;
3558 }
3559
3560 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3561 {
3562         struct perf_cpu_map *cpus;
3563
3564         cpus = perf_cpu_map__new(mask_spec);
3565         if (!cpus)
3566                 return -ENOMEM;
3567
3568         bitmap_zero(mask->bits, mask->nbits);
3569         if (record__mmap_cpu_mask_init(mask, cpus)) {
3570                 perf_cpu_map__put(cpus);
3571                 return -ENODEV;
3572         }
3573         perf_cpu_map__put(cpus);
3574         return 0;
3575 }
3576
3577 static void record__free_thread_masks(struct record *rec, int nr_threads)
3578 {
3579         int t;
3580
3581         if (rec->thread_masks)
3582                 for (t = 0; t < nr_threads; t++)
3583                         record__thread_mask_free(&rec->thread_masks[t]);
3584
3585         zfree(&rec->thread_masks);
3586 }
3587
3588 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3589 {
3590         int t, ret;
3591
3592         rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3593         if (!rec->thread_masks) {
3594                 pr_err("Failed to allocate thread masks\n");
3595                 return -ENOMEM;
3596         }
3597
3598         for (t = 0; t < nr_threads; t++) {
3599                 ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3600                 if (ret) {
3601                         pr_err("Failed to allocate thread masks[%d]\n", t);
3602                         goto out_free;
3603                 }
3604         }
3605
3606         return 0;
3607
3608 out_free:
3609         record__free_thread_masks(rec, nr_threads);
3610
3611         return ret;
3612 }
3613
3614 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3615 {
3616         int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3617
3618         ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3619         if (ret)
3620                 return ret;
3621
3622         rec->nr_threads = nr_cpus;
3623         pr_debug("nr_threads: %d\n", rec->nr_threads);
3624
3625         for (t = 0; t < rec->nr_threads; t++) {
3626                 __set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3627                 __set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3628                 if (verbose > 0) {
3629                         pr_debug("thread_masks[%d]: ", t);
3630                         mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3631                         pr_debug("thread_masks[%d]: ", t);
3632                         mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3633                 }
3634         }
3635
3636         return 0;
3637 }
3638
3639 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3640                                           const char **maps_spec, const char **affinity_spec,
3641                                           u32 nr_spec)
3642 {
3643         u32 s;
3644         int ret = 0, t = 0;
3645         struct mmap_cpu_mask cpus_mask;
3646         struct thread_mask thread_mask, full_mask, *thread_masks;
3647
3648         ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3649         if (ret) {
3650                 pr_err("Failed to allocate CPUs mask\n");
3651                 return ret;
3652         }
3653
3654         ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3655         if (ret) {
3656                 pr_err("Failed to init cpu mask\n");
3657                 goto out_free_cpu_mask;
3658         }
3659
3660         ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3661         if (ret) {
3662                 pr_err("Failed to allocate full mask\n");
3663                 goto out_free_cpu_mask;
3664         }
3665
3666         ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3667         if (ret) {
3668                 pr_err("Failed to allocate thread mask\n");
3669                 goto out_free_full_and_cpu_masks;
3670         }
3671
3672         for (s = 0; s < nr_spec; s++) {
3673                 ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3674                 if (ret) {
3675                         pr_err("Failed to initialize maps thread mask\n");
3676                         goto out_free;
3677                 }
3678                 ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3679                 if (ret) {
3680                         pr_err("Failed to initialize affinity thread mask\n");
3681                         goto out_free;
3682                 }
3683
3684                 /* ignore invalid CPUs but do not allow empty masks */
3685                 if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3686                                 cpus_mask.bits, thread_mask.maps.nbits)) {
3687                         pr_err("Empty maps mask: %s\n", maps_spec[s]);
3688                         ret = -EINVAL;
3689                         goto out_free;
3690                 }
3691                 if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3692                                 cpus_mask.bits, thread_mask.affinity.nbits)) {
3693                         pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3694                         ret = -EINVAL;
3695                         goto out_free;
3696                 }
3697
3698                 /* do not allow intersection with other masks (full_mask) */
3699                 if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3700                                       thread_mask.maps.nbits)) {
3701                         pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3702                         ret = -EINVAL;
3703                         goto out_free;
3704                 }
3705                 if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3706                                       thread_mask.affinity.nbits)) {
3707                         pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3708                         ret = -EINVAL;
3709                         goto out_free;
3710                 }
3711
3712                 bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3713                           thread_mask.maps.bits, full_mask.maps.nbits);
3714                 bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3715                           thread_mask.affinity.bits, full_mask.affinity.nbits);
3716
3717                 thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3718                 if (!thread_masks) {
3719                         pr_err("Failed to reallocate thread masks\n");
3720                         ret = -ENOMEM;
3721                         goto out_free;
3722                 }
3723                 rec->thread_masks = thread_masks;
3724                 rec->thread_masks[t] = thread_mask;
3725                 if (verbose > 0) {
3726                         pr_debug("thread_masks[%d]: ", t);
3727                         mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3728                         pr_debug("thread_masks[%d]: ", t);
3729                         mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3730                 }
3731                 t++;
3732                 ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3733                 if (ret) {
3734                         pr_err("Failed to allocate thread mask\n");
3735                         goto out_free_full_and_cpu_masks;
3736                 }
3737         }
3738         rec->nr_threads = t;
3739         pr_debug("nr_threads: %d\n", rec->nr_threads);
3740         if (!rec->nr_threads)
3741                 ret = -EINVAL;
3742
3743 out_free:
3744         record__thread_mask_free(&thread_mask);
3745 out_free_full_and_cpu_masks:
3746         record__thread_mask_free(&full_mask);
3747 out_free_cpu_mask:
3748         record__mmap_cpu_mask_free(&cpus_mask);
3749
3750         return ret;
3751 }
3752
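/*
 * The core/package/NUMA helpers below all reduce to
 * record__init_thread_masks_spec(), passing the CPU list strings of the
 * respective topology level as both the maps and the affinity specs.
 */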
3753 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3754 {
3755         int ret;
3756         struct cpu_topology *topo;
3757
3758         topo = cpu_topology__new();
3759         if (!topo) {
3760                 pr_err("Failed to allocate CPU topology\n");
3761                 return -ENOMEM;
3762         }
3763
3764         ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3765                                              topo->core_cpus_list, topo->core_cpus_lists);
3766         cpu_topology__delete(topo);
3767
3768         return ret;
3769 }
3770
3771 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3772 {
3773         int ret;
3774         struct cpu_topology *topo;
3775
3776         topo = cpu_topology__new();
3777         if (!topo) {
3778                 pr_err("Failed to allocate CPU topology\n");
3779                 return -ENOMEM;
3780         }
3781
3782         ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3783                                              topo->package_cpus_list, topo->package_cpus_lists);
3784         cpu_topology__delete(topo);
3785
3786         return ret;
3787 }
3788
3789 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3790 {
3791         u32 s;
3792         int ret;
3793         const char **spec;
3794         struct numa_topology *topo;
3795
3796         topo = numa_topology__new();
3797         if (!topo) {
3798                 pr_err("Failed to allocate NUMA topology\n");
3799                 return -ENOMEM;
3800         }
3801
3802         spec = zalloc(topo->nr * sizeof(char *));
3803         if (!spec) {
3804                 pr_err("Failed to allocate NUMA spec\n");
3805                 ret = -ENOMEM;
3806                 goto out_delete_topo;
3807         }
3808         for (s = 0; s < topo->nr; s++)
3809                 spec[s] = topo->nodes[s].cpus;
3810
3811         ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3812
3813         zfree(&spec);
3814
3815 out_delete_topo:
3816         numa_topology__delete(topo);
3817
3818         return ret;
3819 }
3820
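/*
 * Illustrative user spec for --threads=<spec> as parsed below; the CPU ranges
 * are arbitrary examples:
 *
 *   --threads=0-3/0-3:4-7/4-7
 *
 * i.e. colon-separated <maps cpus>/<affinity cpus> pairs, one pair per data
 * streaming thread.
 */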
3821 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3822 {
3823         int t, ret;
3824         u32 s, nr_spec = 0;
3825         char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3826         char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3827
3828         for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3829                 spec = strtok_r(user_spec, ":", &spec_ptr);
3830                 if (spec == NULL)
3831                         break;
3832                 pr_debug2("threads_spec[%d]: %s\n", t, spec);
3833                 mask = strtok_r(spec, "/", &mask_ptr);
3834                 if (mask == NULL)
3835                         break;
3836                 pr_debug2("  maps mask: %s\n", mask);
3837                 tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3838                 if (!tmp_spec) {
3839                         pr_err("Failed to reallocate maps spec\n");
3840                         ret = -ENOMEM;
3841                         goto out_free;
3842                 }
3843                 maps_spec = tmp_spec;
3844                 maps_spec[nr_spec] = dup_mask = strdup(mask);
3845                 if (!maps_spec[nr_spec]) {
3846                         pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3847                         ret = -ENOMEM;
3848                         goto out_free;
3849                 }
3850                 mask = strtok_r(NULL, "/", &mask_ptr);
3851                 if (mask == NULL) {
3852                         pr_err("Invalid thread maps or affinity specs\n");
3853                         ret = -EINVAL;
3854                         goto out_free;
3855                 }
3856                 pr_debug2("  affinity mask: %s\n", mask);
3857                 tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3858                 if (!tmp_spec) {
3859                         pr_err("Failed to reallocate affinity spec\n");
3860                         ret = -ENOMEM;
3861                         goto out_free;
3862                 }
3863                 affinity_spec = tmp_spec;
3864                 affinity_spec[nr_spec] = strdup(mask);
3865                 if (!affinity_spec[nr_spec]) {
3866                         pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3867                         ret = -ENOMEM;
3868                         goto out_free;
3869                 }
3870                 dup_mask = NULL;
3871                 nr_spec++;
3872         }
3873
3874         ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3875                                              (const char **)affinity_spec, nr_spec);
3876
3877 out_free:
3878         free(dup_mask);
3879         for (s = 0; s < nr_spec; s++) {
3880                 if (maps_spec)
3881                         free(maps_spec[s]);
3882                 if (affinity_spec)
3883                         free(affinity_spec[s]);
3884         }
3885         free(affinity_spec);
3886         free(maps_spec);
3887
3888         return ret;
3889 }
3890
3891 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3892 {
3893         int ret;
3894
3895         ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3896         if (ret)
3897                 return ret;
3898
3899         if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
3900                 return -ENODEV;
3901
3902         rec->nr_threads = 1;
3903
3904         return 0;
3905 }
3906
3907 static int record__init_thread_masks(struct record *rec)
3908 {
3909         int ret = 0;
3910         struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
3911
3912         if (!record__threads_enabled(rec))
3913                 return record__init_thread_default_masks(rec, cpus);
3914
3915         if (evlist__per_thread(rec->evlist)) {
3916                 pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
3917                 return -EINVAL;
3918         }
3919
3920         switch (rec->opts.threads_spec) {
3921         case THREAD_SPEC__CPU:
3922                 ret = record__init_thread_cpu_masks(rec, cpus);
3923                 break;
3924         case THREAD_SPEC__CORE:
3925                 ret = record__init_thread_core_masks(rec, cpus);
3926                 break;
3927         case THREAD_SPEC__PACKAGE:
3928                 ret = record__init_thread_package_masks(rec, cpus);
3929                 break;
3930         case THREAD_SPEC__NUMA:
3931                 ret = record__init_thread_numa_masks(rec, cpus);
3932                 break;
3933         case THREAD_SPEC__USER:
3934                 ret = record__init_thread_user_masks(rec, cpus);
3935                 break;
3936         default:
3937                 break;
3938         }
3939
3940         return ret;
3941 }
3942
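/*
 * Entry point for 'perf record': parse the options, apply perf_config()
 * settings and feature checks, build the thread masks for parallel data
 * streaming, and finally hand control to __cmd_record().
 */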
3943 int cmd_record(int argc, const char **argv)
3944 {
3945         int err;
3946         struct record *rec = &record;
3947         char errbuf[BUFSIZ];
3948
3949         setlocale(LC_ALL, "");
3950
3951 #ifndef HAVE_LIBBPF_SUPPORT
3952 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
3953         set_nobuild('\0', "clang-path", true);
3954         set_nobuild('\0', "clang-opt", true);
3955 # undef set_nobuild
3956 #endif
3957
3958 #ifndef HAVE_BPF_PROLOGUE
3959 # if !defined (HAVE_DWARF_SUPPORT)
3960 #  define REASON  "NO_DWARF=1"
3961 # elif !defined (HAVE_LIBBPF_SUPPORT)
3962 #  define REASON  "NO_LIBBPF=1"
3963 # else
3964 #  define REASON  "this architecture doesn't support BPF prologue"
3965 # endif
3966 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
3967         set_nobuild('\0', "vmlinux", true);
3968 # undef set_nobuild
3969 # undef REASON
3970 #endif
3971
3972 #ifndef HAVE_BPF_SKEL
3973 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
3974         set_nobuild('\0', "off-cpu", "NO_BPF_SKEL=1", true);
3975 # undef set_nobuild
3976 #endif
3977
3978         rec->opts.affinity = PERF_AFFINITY_SYS;
3979
3980         rec->evlist = evlist__new();
3981         if (rec->evlist == NULL)
3982                 return -ENOMEM;
3983
3984         err = perf_config(perf_record_config, rec);
3985         if (err)
3986                 return err;
3987
3988         argc = parse_options(argc, argv, record_options, record_usage,
3989                             PARSE_OPT_STOP_AT_NON_OPTION);
3990         if (quiet)
3991                 perf_quiet_option();
3992
3993         err = symbol__validate_sym_arguments();
3994         if (err)
3995                 return err;
3996
3997         perf_debuginfod_setup(&record.debuginfod);
3998
3999         /* Make system wide (-a) the default target. */
4000         if (!argc && target__none(&rec->opts.target))
4001                 rec->opts.target.system_wide = true;
4002
4003         if (nr_cgroups && !rec->opts.target.system_wide) {
4004                 usage_with_options_msg(record_usage, record_options,
4005                         "cgroup monitoring only available in system-wide mode");
4006
4007         }
4008
4009         if (rec->buildid_mmap) {
4010                 if (!perf_can_record_build_id()) {
4011                         pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
4012                         err = -EINVAL;
4013                         goto out_opts;
4014                 }
4015                 pr_debug("Enabling build id in mmap2 events.\n");
4016                 /* Enable mmap build id synthesizing. */
4017                 symbol_conf.buildid_mmap2 = true;
4018                 /* Enable perf_event_attr::build_id bit. */
4019                 rec->opts.build_id = true;
4020                 /* Disable build id cache. */
4021                 rec->no_buildid = true;
4022         }
4023
4024         if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4025                 pr_err("Kernel has no cgroup sampling support.\n");
4026                 err = -EINVAL;
4027                 goto out_opts;
4028         }
4029
4030         if (rec->opts.kcore)
4031                 rec->opts.text_poke = true;
4032
4033         if (rec->opts.kcore || record__threads_enabled(rec))
4034                 rec->data.is_dir = true;
4035
4036         if (record__threads_enabled(rec)) {
4037                 if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4038                         pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
4039                         goto out_opts;
4040                 }
4041                 if (record__aio_enabled(rec)) {
4042                         pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
4043                         goto out_opts;
4044                 }
4045         }
4046
4047         if (rec->opts.comp_level != 0) {
4048                 pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4049                 rec->no_buildid = true;
4050         }
4051
4052         if (rec->opts.record_switch_events &&
4053             !perf_can_record_switch_events()) {
4054                 ui__error("kernel does not support recording context switch events\n");
4055                 parse_options_usage(record_usage, record_options, "switch-events", 0);
4056                 err = -EINVAL;
4057                 goto out_opts;
4058         }
4059
4060         if (switch_output_setup(rec)) {
4061                 parse_options_usage(record_usage, record_options, "switch-output", 0);
4062                 err = -EINVAL;
4063                 goto out_opts;
4064         }
4065
4066         if (rec->switch_output.time) {
4067                 signal(SIGALRM, alarm_sig_handler);
4068                 alarm(rec->switch_output.time);
4069         }
4070
4071         if (rec->switch_output.num_files) {
4072                 rec->switch_output.filenames = calloc(rec->switch_output.num_files,
4073                                                       sizeof(char *));
4074                 if (!rec->switch_output.filenames) {
4075                         err = -EINVAL;
4076                         goto out_opts;
4077                 }
4078         }
4079
4080         if (rec->timestamp_filename && record__threads_enabled(rec)) {
4081                 rec->timestamp_filename = false;
4082                 pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4083         }
4084
4085         /*
4086          * Allow aliases to facilitate the lookup of symbols for address
4087          * filters. Refer to auxtrace_parse_filters().
4088          */
4089         symbol_conf.allow_aliases = true;
4090
4091         symbol__init(NULL);
4092
4093         err = record__auxtrace_init(rec);
4094         if (err)
4095                 goto out;
4096
4097         if (dry_run)
4098                 goto out;
4099
4100         err = bpf__setup_stdout(rec->evlist);
4101         if (err) {
4102                 bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
4103                 pr_err("ERROR: Setup BPF stdout failed: %s\n",
4104                          errbuf);
4105                 goto out;
4106         }
4107
4108         err = -ENOMEM;
4109
4110         if (rec->no_buildid_cache || rec->no_buildid) {
4111                 disable_buildid_cache();
4112         } else if (rec->switch_output.enabled) {
4113                 /*
4114                  * In 'perf record --switch-output', disable buildid
4115                  * generation by default to reduce data file switching
4116                  * overhead. Still generate buildid if they are required
4117                  * explicitly using
4118                  *
4119                  *  perf record --switch-output --no-no-buildid \
4120                  *              --no-no-buildid-cache
4121                  *
4122                  * Following code equals to:
4123                  *
4124                  * if ((rec->no_buildid || !rec->no_buildid_set) &&
4125                  *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4126                  *         disable_buildid_cache();
4127                  */
4128                 bool disable = true;
4129
4130                 if (rec->no_buildid_set && !rec->no_buildid)
4131                         disable = false;
4132                 if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4133                         disable = false;
4134                 if (disable) {
4135                         rec->no_buildid = true;
4136                         rec->no_buildid_cache = true;
4137                         disable_buildid_cache();
4138                 }
4139         }
4140
4141         if (record.opts.overwrite)
4142                 record.opts.tail_synthesize = true;
4143
4144         if (rec->evlist->core.nr_entries == 0) {
4145                 if (perf_pmu__has_hybrid()) {
4146                         err = evlist__add_default_hybrid(rec->evlist,
4147                                                          !record.opts.no_samples);
4148                 } else {
4149                         err = __evlist__add_default(rec->evlist,
4150                                                     !record.opts.no_samples);
4151                 }
4152
4153                 if (err < 0) {
4154                         pr_err("Not enough memory for event selector list\n");
4155                         goto out;
4156                 }
4157         }
4158
4159         if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4160                 rec->opts.no_inherit = true;
4161
4162         err = target__validate(&rec->opts.target);
4163         if (err) {
4164                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4165                 ui__warning("%s\n", errbuf);
4166         }
4167
4168         err = target__parse_uid(&rec->opts.target);
4169         if (err) {
4170                 int saved_errno = errno;
4171
4172                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4173                 ui__error("%s", errbuf);
4174
4175                 err = -saved_errno;
4176                 goto out;
4177         }
4178
4179         /* Enable ignoring missing threads when -u/-p option is defined. */
4180         rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
4181
        if (evlist__fix_hybrid_cpus(rec->evlist, rec->opts.target.cpu_list)) {
                pr_err("failed to use cpu list %s\n",
                       rec->opts.target.cpu_list);
                err = -EINVAL;
                goto out;
        }

        rec->opts.target.hybrid = perf_pmu__has_hybrid();

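        /*
         * For frame-pointer based call chains, let the architecture add
         * any extra record options it needs to report leaf-frame callers.
         */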
        if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
                arch__add_leaf_frame_record_opts(&rec->opts);

        err = -ENOMEM;
        if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
                if (rec->opts.target.pid != NULL) {
                        pr_err("Couldn't create thread/CPU maps: %s\n",
                               errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
                        goto out;
                } else {
                        usage_with_options(record_usage, record_options);
                }
        }

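        /* Let the AUX area tracing implementation (if any) process its options. */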
        err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
        if (err)
                goto out;

        /*
         * We take all buildids when the file contains AUX area
         * tracing data, because we do not decode the trace: decoding
         * would take too long.
         */
        if (rec->opts.full_auxtrace)
                rec->buildid_all = true;

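        /* Configure the event list to record kernel text modifications (text poke events). */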
        if (rec->opts.text_poke) {
                err = record__config_text_poke(rec->evlist);
                if (err) {
                        pr_err("record__config_text_poke failed, error %d\n", err);
                        goto out;
                }
        }

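        /* Set up the off-CPU profiling event for --off-cpu. */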
        if (rec->off_cpu) {
                err = record__config_off_cpu(rec);
                if (err) {
                        pr_err("record__config_off_cpu failed, error %d\n", err);
                        goto out;
                }
        }

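        /* Finalize the remaining generic record options (e.g. the default sampling frequency). */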
        if (record_opts__config(&rec->opts)) {
                err = -EINVAL;
                goto out;
        }

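        /* Build the per-thread CPU masks used for parallel (--threads) data streaming. */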
        err = record__init_thread_masks(rec);
        if (err) {
                pr_err("Failed to initialize parallel data streaming masks\n");
                goto out;
        }

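        /* Clamp the number of asynchronous (--aio) write blocks to the supported maximum. */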
        if (rec->opts.nr_cblocks > nr_cblocks_max)
                rec->opts.nr_cblocks = nr_cblocks_max;
        pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);

        pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
        pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);

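        /* Clamp the requested trace compression level (-z) to the supported maximum. */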
        if (rec->opts.comp_level > comp_level_max)
                rec->opts.comp_level = comp_level_max;
        pr_debug("comp level: %d\n", rec->opts.comp_level);

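        /* All options are set up: run the actual record session. */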
        err = __cmd_record(&record, argc, argv);
out:
        evlist__delete(rec->evlist);
        symbol__exit();
        auxtrace_record__free(rec->itr);
out_opts:
        record__free_thread_masks(rec, rec->nr_threads);
        rec->nr_threads = 0;
        evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
        return err;
}

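/*
 * SIGUSR2 handler: take an AUX area tracing snapshot and, when
 * --switch-output=signal is in use, trigger a switch to a new
 * perf.data file.
 */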
static void snapshot_sig_handler(int sig __maybe_unused)
{
        struct record *rec = &record;

        hit_auxtrace_snapshot_trigger(rec);

        if (switch_output_signal(rec))
                trigger_hit(&switch_output_trigger);
}

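/*
 * SIGALRM handler: trigger a switch to a new perf.data file when
 * --switch-output is time based.
 */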
static void alarm_sig_handler(int sig __maybe_unused)
{
        struct record *rec = &record;

        if (switch_output_time(rec))
                trigger_hit(&switch_output_trigger);
}