1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
16
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/symbol.h"
30 #include "util/record.h"
31 #include "util/cpumap.h"
32 #include "util/thread_map.h"
33 #include "util/data.h"
34 #include "util/perf_regs.h"
35 #include "util/auxtrace.h"
36 #include "util/tsc.h"
37 #include "util/parse-branch-options.h"
38 #include "util/parse-regs-options.h"
39 #include "util/perf_api_probe.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/pmu.h"
50 #include "util/pmus.h"
51 #include "util/clockid.h"
52 #include "util/off_cpu.h"
53 #include "util/bpf-filter.h"
54 #include "asm/bug.h"
55 #include "perf.h"
56 #include "cputopo.h"
57
58 #include <errno.h>
59 #include <inttypes.h>
60 #include <locale.h>
61 #include <poll.h>
62 #include <pthread.h>
63 #include <unistd.h>
64 #ifndef HAVE_GETTID
65 #include <syscall.h>
66 #endif
67 #include <sched.h>
68 #include <signal.h>
69 #ifdef HAVE_EVENTFD_SUPPORT
70 #include <sys/eventfd.h>
71 #endif
72 #include <sys/mman.h>
73 #include <sys/wait.h>
74 #include <sys/types.h>
75 #include <sys/stat.h>
76 #include <fcntl.h>
77 #include <linux/err.h>
78 #include <linux/string.h>
79 #include <linux/time64.h>
80 #include <linux/zalloc.h>
81 #include <linux/bitmap.h>
82 #include <sys/time.h>
83
84 struct switch_output {
85         bool             enabled;
86         bool             signal;
87         unsigned long    size;
88         unsigned long    time;
89         const char      *str;
90         bool             set;
91         char             **filenames;
92         int              num_files;
93         int              cur_file;
94 };
95
96 struct thread_mask {
97         struct mmap_cpu_mask    maps;
98         struct mmap_cpu_mask    affinity;
99 };
100
101 struct record_thread {
102         pid_t                   tid;
103         struct thread_mask      *mask;
104         struct {
105                 int             msg[2];
106                 int             ack[2];
107         } pipes;
108         struct fdarray          pollfd;
109         int                     ctlfd_pos;
110         int                     nr_mmaps;
111         struct mmap             **maps;
112         struct mmap             **overwrite_maps;
113         struct record           *rec;
114         unsigned long long      samples;
115         unsigned long           waking;
116         u64                     bytes_written;
117         u64                     bytes_transferred;
118         u64                     bytes_compressed;
119 };
120
121 static __thread struct record_thread *thread;
122
123 enum thread_msg {
124         THREAD_MSG__UNDEFINED = 0,
125         THREAD_MSG__READY,
126         THREAD_MSG__MAX,
127 };
128
129 static const char *thread_msg_tags[THREAD_MSG__MAX] = {
130         "UNDEFINED", "READY"
131 };
132
133 enum thread_spec {
134         THREAD_SPEC__UNDEFINED = 0,
135         THREAD_SPEC__CPU,
136         THREAD_SPEC__CORE,
137         THREAD_SPEC__PACKAGE,
138         THREAD_SPEC__NUMA,
139         THREAD_SPEC__USER,
140         THREAD_SPEC__MAX,
141 };
142
143 static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
144         "undefined", "cpu", "core", "package", "numa", "user"
145 };
146
147 struct pollfd_index_map {
148         int evlist_pollfd_index;
149         int thread_pollfd_index;
150 };
151
152 struct record {
153         struct perf_tool        tool;
154         struct record_opts      opts;
155         u64                     bytes_written;
156         u64                     thread_bytes_written;
157         struct perf_data        data;
158         struct auxtrace_record  *itr;
159         struct evlist   *evlist;
160         struct perf_session     *session;
161         struct evlist           *sb_evlist;
162         pthread_t               thread_id;
163         int                     realtime_prio;
164         bool                    switch_output_event_set;
165         bool                    no_buildid;
166         bool                    no_buildid_set;
167         bool                    no_buildid_cache;
168         bool                    no_buildid_cache_set;
169         bool                    buildid_all;
170         bool                    buildid_mmap;
171         bool                    timestamp_filename;
172         bool                    timestamp_boundary;
173         bool                    off_cpu;
174         struct switch_output    switch_output;
175         unsigned long long      samples;
176         unsigned long           output_max_size;        /* = 0: unlimited */
177         struct perf_debuginfod  debuginfod;
178         int                     nr_threads;
179         struct thread_mask      *thread_masks;
180         struct record_thread    *thread_data;
181         struct pollfd_index_map *index_map;
182         size_t                  index_map_sz;
183         size_t                  index_map_cnt;
184 };
185
186 static volatile int done;
187
188 static volatile int auxtrace_record__snapshot_started;
189 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
190 static DEFINE_TRIGGER(switch_output_trigger);
191
192 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
193         "SYS", "NODE", "CPU"
194 };
195
196 #ifndef HAVE_GETTID
197 static inline pid_t gettid(void)
198 {
199         return (pid_t)syscall(__NR_gettid);
200 }
201 #endif
202
203 static int record__threads_enabled(struct record *rec)
204 {
205         return rec->opts.threads_spec;
206 }
207
208 static bool switch_output_signal(struct record *rec)
209 {
210         return rec->switch_output.signal &&
211                trigger_is_ready(&switch_output_trigger);
212 }
213
214 static bool switch_output_size(struct record *rec)
215 {
216         return rec->switch_output.size &&
217                trigger_is_ready(&switch_output_trigger) &&
218                (rec->bytes_written >= rec->switch_output.size);
219 }
220
221 static bool switch_output_time(struct record *rec)
222 {
223         return rec->switch_output.time &&
224                trigger_is_ready(&switch_output_trigger);
225 }
226
227 static u64 record__bytes_written(struct record *rec)
228 {
229         return rec->bytes_written + rec->thread_bytes_written;
230 }
231
232 static bool record__output_max_size_exceeded(struct record *rec)
233 {
234         return rec->output_max_size &&
235                (record__bytes_written(rec) >= rec->output_max_size);
236 }
237
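/*
 * Write a chunk of data either to the main perf.data file or, when the mmap
 * has its own file (threaded mode), to that per-thread file.  Byte counters
 * are updated accordingly, the session is stopped once --max-size is
 * exceeded, and the switch-output trigger is hit when the configured size
 * threshold for output switching is reached.
 */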
238 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
239                          void *bf, size_t size)
240 {
241         struct perf_data_file *file = &rec->session->data->file;
242
243         if (map && map->file)
244                 file = map->file;
245
246         if (perf_data_file__write(file, bf, size) < 0) {
247                 pr_err("failed to write perf data, error: %m\n");
248                 return -1;
249         }
250
251         if (map && map->file) {
252                 thread->bytes_written += size;
253                 rec->thread_bytes_written += size;
254         } else {
255                 rec->bytes_written += size;
256         }
257
258         if (record__output_max_size_exceeded(rec) && !done) {
259                 fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
260                                 " stopping session ]\n",
261                                 record__bytes_written(rec) >> 10);
262                 done = 1;
263         }
264
265         if (switch_output_size(rec))
266                 trigger_hit(&switch_output_trigger);
267
268         return 0;
269 }
270
271 static int record__aio_enabled(struct record *rec);
272 static int record__comp_enabled(struct record *rec);
273 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
274                             void *dst, size_t dst_size, void *src, size_t src_size);
275
276 #ifdef HAVE_AIO_SUPPORT
277 static int record__aio_write(struct aiocb *cblock, int trace_fd,
278                 void *buf, size_t size, off_t off)
279 {
280         int rc;
281
282         cblock->aio_fildes = trace_fd;
283         cblock->aio_buf    = buf;
284         cblock->aio_nbytes = size;
285         cblock->aio_offset = off;
286         cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
287
288         do {
289                 rc = aio_write(cblock);
290                 if (rc == 0) {
291                         break;
292                 } else if (errno != EAGAIN) {
293                         cblock->aio_fildes = -1;
294                         pr_err("failed to queue perf data, error: %m\n");
295                         break;
296                 }
297         } while (1);
298
299         return rc;
300 }
301
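/*
 * Check the status of a previously queued aio write.  Returns 0 while the
 * request is still in flight or has been requeued with the remainder of a
 * short write, and 1 once the whole chunk has been written, in which case
 * the mmap reference taken in record__aio_pushfn() is dropped.
 */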
302 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
303 {
304         void *rem_buf;
305         off_t rem_off;
306         size_t rem_size;
307         int rc, aio_errno;
308         ssize_t aio_ret, written;
309
310         aio_errno = aio_error(cblock);
311         if (aio_errno == EINPROGRESS)
312                 return 0;
313
314         written = aio_ret = aio_return(cblock);
315         if (aio_ret < 0) {
316                 if (aio_errno != EINTR)
317                         pr_err("failed to write perf data, error: %m\n");
318                 written = 0;
319         }
320
321         rem_size = cblock->aio_nbytes - written;
322
323         if (rem_size == 0) {
324                 cblock->aio_fildes = -1;
325                 /*
326                  * md->refcount is incremented in record__aio_pushfn() for
327                  * every aio write request started in record__aio_push() so
328                  * decrement it because the request is now complete.
329                  */
330                 perf_mmap__put(&md->core);
331                 rc = 1;
332         } else {
333                 /*
334                  * The aio write request may require a restart with the
335                  * remainder if the kernel didn't write the whole
336                  * chunk at once.
337                  */
338                 rem_off = cblock->aio_offset + written;
339                 rem_buf = (void *)(cblock->aio_buf + written);
340                 record__aio_write(cblock, cblock->aio_fildes,
341                                 rem_buf, rem_size, rem_off);
342                 rc = 0;
343         }
344
345         return rc;
346 }
347
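/*
 * Wait for outstanding aio writes on this mmap.  With sync_all == false the
 * index of the first free control block is returned as soon as one becomes
 * available; with sync_all == true the function only returns (-1) after all
 * queued writes have completed.
 */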
348 static int record__aio_sync(struct mmap *md, bool sync_all)
349 {
350         struct aiocb **aiocb = md->aio.aiocb;
351         struct aiocb *cblocks = md->aio.cblocks;
352         struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
353         int i, do_suspend;
354
355         do {
356                 do_suspend = 0;
357                 for (i = 0; i < md->aio.nr_cblocks; ++i) {
358                         if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
359                                 if (sync_all)
360                                         aiocb[i] = NULL;
361                                 else
362                                         return i;
363                         } else {
364                                 /*
365                  * The started aio write is not complete yet,
366                  * so it has to be waited on before the
367                                  * next allocation.
368                                  */
369                                 aiocb[i] = &cblocks[i];
370                                 do_suspend = 1;
371                         }
372                 }
373                 if (!do_suspend)
374                         return -1;
375
376                 while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
377                         if (!(errno == EAGAIN || errno == EINTR))
378                                 pr_err("failed to sync perf data, error: %m\n");
379                 }
380         } while (1);
381 }
382
383 struct record_aio {
384         struct record   *rec;
385         void            *data;
386         size_t          size;
387 };
388
389 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
390 {
391         struct record_aio *aio = to;
392
393         /*
394          * The map->core.base data pointed to by buf is copied into a free map->aio.data[]
395          * buffer to release space in the kernel buffer as fast as possible, by calling
396          * perf_mmap__consume() from the perf_mmap__push() function.
397          *
398          * That lets the kernel proceed with storing more profiling data into
399          * the kernel buffer earlier than other per-cpu kernel buffers are handled.
400          *
401          * Copying can be done in two steps in case the chunk of profiling data
402          * crosses the upper bound of the kernel buffer. In this case we first move
403          * part of the data from map->start up to the upper bound and then the remainder
404          * from the beginning of the kernel buffer up to the end of the data chunk.
405          */
406
407         if (record__comp_enabled(aio->rec)) {
408                 ssize_t compressed = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
409                                                    mmap__mmap_len(map) - aio->size,
410                                                    buf, size);
411                 if (compressed < 0)
412                         return (int)compressed;
413
414                 size = compressed;
415         } else {
416                 memcpy(aio->data + aio->size, buf, size);
417         }
418
419         if (!aio->size) {
420                 /*
421                  * Increment map->refcount to guard map->aio.data[] buffer
422                  * from premature deallocation, because the map object can be
423                  * released before the aio write request started on the
424                  * map->aio.data[] buffer is complete.
425                  *
426                  * perf_mmap__put() is done in record__aio_complete()
427                  * after the started aio request completes, or in record__aio_push()
428                  * if the request failed to start.
429                  */
430                 perf_mmap__get(&map->core);
431         }
432
433         aio->size += size;
434
435         return size;
436 }
437
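/*
 * Push the data currently available in the mmap into a free aio buffer
 * (compressing it first if -z is used) and queue an asynchronous write of
 * that buffer at offset *off in the output file, advancing *off on success.
 */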
438 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
439 {
440         int ret, idx;
441         int trace_fd = rec->session->data->file.fd;
442         struct record_aio aio = { .rec = rec, .size = 0 };
443
444         /*
445          * Call record__aio_sync() to wait until the map->aio.data[] buffer
446          * becomes available after the previous aio write operation completes.
447          */
448
449         idx = record__aio_sync(map, false);
450         aio.data = map->aio.data[idx];
451         ret = perf_mmap__push(map, &aio, record__aio_pushfn);
452         if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
453                 return ret;
454
455         rec->samples++;
456         ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
457         if (!ret) {
458                 *off += aio.size;
459                 rec->bytes_written += aio.size;
460                 if (switch_output_size(rec))
461                         trigger_hit(&switch_output_trigger);
462         } else {
463                 /*
464                  * Decrement the map->refcount incremented in record__aio_pushfn()
465                  * if the record__aio_write() operation failed to start; otherwise
466                  * map->refcount is decremented in record__aio_complete() after
467                  * the aio write operation finishes successfully.
468                  */
469                 perf_mmap__put(&map->core);
470         }
471
472         return ret;
473 }
474
475 static off_t record__aio_get_pos(int trace_fd)
476 {
477         return lseek(trace_fd, 0, SEEK_CUR);
478 }
479
480 static void record__aio_set_pos(int trace_fd, off_t pos)
481 {
482         lseek(trace_fd, pos, SEEK_SET);
483 }
484
485 static void record__aio_mmap_read_sync(struct record *rec)
486 {
487         int i;
488         struct evlist *evlist = rec->evlist;
489         struct mmap *maps = evlist->mmap;
490
491         if (!record__aio_enabled(rec))
492                 return;
493
494         for (i = 0; i < evlist->core.nr_mmaps; i++) {
495                 struct mmap *map = &maps[i];
496
497                 if (map->core.base)
498                         record__aio_sync(map, true);
499         }
500 }
501
502 static int nr_cblocks_default = 1;
503 static int nr_cblocks_max = 4;
504
505 static int record__aio_parse(const struct option *opt,
506                              const char *str,
507                              int unset)
508 {
509         struct record_opts *opts = (struct record_opts *)opt->value;
510
511         if (unset) {
512                 opts->nr_cblocks = 0;
513         } else {
514                 if (str)
515                         opts->nr_cblocks = strtol(str, NULL, 0);
516                 if (!opts->nr_cblocks)
517                         opts->nr_cblocks = nr_cblocks_default;
518         }
519
520         return 0;
521 }
522 #else /* HAVE_AIO_SUPPORT */
523 static int nr_cblocks_max = 0;
524
525 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
526                             off_t *off __maybe_unused)
527 {
528         return -1;
529 }
530
531 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
532 {
533         return -1;
534 }
535
536 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
537 {
538 }
539
540 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
541 {
542 }
543 #endif
544
545 static int record__aio_enabled(struct record *rec)
546 {
547         return rec->opts.nr_cblocks > 0;
548 }
549
550 #define MMAP_FLUSH_DEFAULT 1
551 static int record__mmap_flush_parse(const struct option *opt,
552                                     const char *str,
553                                     int unset)
554 {
555         int flush_max;
556         struct record_opts *opts = (struct record_opts *)opt->value;
557         static struct parse_tag tags[] = {
558                         { .tag  = 'B', .mult = 1       },
559                         { .tag  = 'K', .mult = 1 << 10 },
560                         { .tag  = 'M', .mult = 1 << 20 },
561                         { .tag  = 'G', .mult = 1 << 30 },
562                         { .tag  = 0 },
563         };
564
565         if (unset)
566                 return 0;
567
568         if (str) {
569                 opts->mmap_flush = parse_tag_value(str, tags);
570                 if (opts->mmap_flush == (int)-1)
571                         opts->mmap_flush = strtol(str, NULL, 0);
572         }
573
574         if (!opts->mmap_flush)
575                 opts->mmap_flush = MMAP_FLUSH_DEFAULT;
576
577         flush_max = evlist__mmap_size(opts->mmap_pages);
578         flush_max /= 4;
579         if (opts->mmap_flush > flush_max)
580                 opts->mmap_flush = flush_max;
581
582         return 0;
583 }
584
585 #ifdef HAVE_ZSTD_SUPPORT
586 static unsigned int comp_level_default = 1;
587
588 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
589 {
590         struct record_opts *opts = opt->value;
591
592         if (unset) {
593                 opts->comp_level = 0;
594         } else {
595                 if (str)
596                         opts->comp_level = strtol(str, NULL, 0);
597                 if (!opts->comp_level)
598                         opts->comp_level = comp_level_default;
599         }
600
601         return 0;
602 }
603 #endif
604 static unsigned int comp_level_max = 22;
605
606 static int record__comp_enabled(struct record *rec)
607 {
608         return rec->opts.comp_level > 0;
609 }
610
611 static int process_synthesized_event(struct perf_tool *tool,
612                                      union perf_event *event,
613                                      struct perf_sample *sample __maybe_unused,
614                                      struct machine *machine __maybe_unused)
615 {
616         struct record *rec = container_of(tool, struct record, tool);
617         return record__write(rec, NULL, event, event->header.size);
618 }
619
620 static struct mutex synth_lock;
621
622 static int process_locked_synthesized_event(struct perf_tool *tool,
623                                      union perf_event *event,
624                                      struct perf_sample *sample __maybe_unused,
625                                      struct machine *machine __maybe_unused)
626 {
627         int ret;
628
629         mutex_lock(&synth_lock);
630         ret = process_synthesized_event(tool, event, sample, machine);
631         mutex_unlock(&synth_lock);
632         return ret;
633 }
634
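/*
 * perf_mmap__push() callback for the serial (non-AIO) path: optionally
 * compress the chunk into map->data, count the sample for the current
 * thread and hand the data over to record__write().
 */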
635 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
636 {
637         struct record *rec = to;
638
639         if (record__comp_enabled(rec)) {
640                 ssize_t compressed = zstd_compress(rec->session, map, map->data,
641                                                    mmap__mmap_len(map), bf, size);
642
643                 if (compressed < 0)
644                         return (int)compressed;
645
646                 size = compressed;
647                 bf   = map->data;
648         }
649
650         thread->samples++;
651         return record__write(rec, map, bf, size);
652 }
653
654 static volatile sig_atomic_t signr = -1;
655 static volatile sig_atomic_t child_finished;
656 #ifdef HAVE_EVENTFD_SUPPORT
657 static volatile sig_atomic_t done_fd = -1;
658 #endif
659
660 static void sig_handler(int sig)
661 {
662         if (sig == SIGCHLD)
663                 child_finished = 1;
664         else
665                 signr = sig;
666
667         done = 1;
668 #ifdef HAVE_EVENTFD_SUPPORT
669         if (done_fd >= 0) {
670                 u64 tmp = 1;
671                 int orig_errno = errno;
672
673                 /*
674                  * It is possible for this signal handler to run after done is
675                  * checked in the main loop, but before the perf counter fds are
676                  * polled. If this happens, the poll() will continue to wait
677                  * even though done is set, and will only break out if either
678                  * another signal is received, or the counters are ready for
679                  * read. To ensure the poll() doesn't sleep when done is set,
680                  * use an eventfd (done_fd) to wake up the poll().
681                  */
682                 if (write(done_fd, &tmp, sizeof(tmp)) < 0)
683                         pr_err("failed to signal wakeup fd, error: %m\n");
684
685                 errno = orig_errno;
686         }
687 #endif // HAVE_EVENTFD_SUPPORT
688 }
689
690 static void sigsegv_handler(int sig)
691 {
692         perf_hooks__recover();
693         sighandler_dump_stack(sig);
694 }
695
696 static void record__sig_exit(void)
697 {
698         if (signr == -1)
699                 return;
700
701         signal(signr, SIG_DFL);
702         raise(signr);
703 }
704
705 #ifdef HAVE_AUXTRACE_SUPPORT
706
707 static int record__process_auxtrace(struct perf_tool *tool,
708                                     struct mmap *map,
709                                     union perf_event *event, void *data1,
710                                     size_t len1, void *data2, size_t len2)
711 {
712         struct record *rec = container_of(tool, struct record, tool);
713         struct perf_data *data = &rec->data;
714         size_t padding;
715         u8 pad[8] = {0};
716
717         if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
718                 off_t file_offset;
719                 int fd = perf_data__fd(data);
720                 int err;
721
722                 file_offset = lseek(fd, 0, SEEK_CUR);
723                 if (file_offset == -1)
724                         return -1;
725                 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
726                                                      event, file_offset);
727                 if (err)
728                         return err;
729         }
730
731         /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
732         padding = (len1 + len2) & 7;
733         if (padding)
734                 padding = 8 - padding;
735
736         record__write(rec, map, event, event->header.size);
737         record__write(rec, map, data1, len1);
738         if (len2)
739                 record__write(rec, map, data2, len2);
740         record__write(rec, map, &pad, padding);
741
742         return 0;
743 }
744
745 static int record__auxtrace_mmap_read(struct record *rec,
746                                       struct mmap *map)
747 {
748         int ret;
749
750         ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
751                                   record__process_auxtrace);
752         if (ret < 0)
753                 return ret;
754
755         if (ret)
756                 rec->samples++;
757
758         return 0;
759 }
760
761 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
762                                                struct mmap *map)
763 {
764         int ret;
765
766         ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
767                                            record__process_auxtrace,
768                                            rec->opts.auxtrace_snapshot_size);
769         if (ret < 0)
770                 return ret;
771
772         if (ret)
773                 rec->samples++;
774
775         return 0;
776 }
777
778 static int record__auxtrace_read_snapshot_all(struct record *rec)
779 {
780         int i;
781         int rc = 0;
782
783         for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
784                 struct mmap *map = &rec->evlist->mmap[i];
785
786                 if (!map->auxtrace_mmap.base)
787                         continue;
788
789                 if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
790                         rc = -1;
791                         goto out;
792                 }
793         }
794 out:
795         return rc;
796 }
797
798 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
799 {
800         pr_debug("Recording AUX area tracing snapshot\n");
801         if (record__auxtrace_read_snapshot_all(rec) < 0) {
802                 trigger_error(&auxtrace_snapshot_trigger);
803         } else {
804                 if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
805                         trigger_error(&auxtrace_snapshot_trigger);
806                 else
807                         trigger_ready(&auxtrace_snapshot_trigger);
808         }
809 }
810
811 static int record__auxtrace_snapshot_exit(struct record *rec)
812 {
813         if (trigger_is_error(&auxtrace_snapshot_trigger))
814                 return 0;
815
816         if (!auxtrace_record__snapshot_started &&
817             auxtrace_record__snapshot_start(rec->itr))
818                 return -1;
819
820         record__read_auxtrace_snapshot(rec, true);
821         if (trigger_is_error(&auxtrace_snapshot_trigger))
822                 return -1;
823
824         return 0;
825 }
826
827 static int record__auxtrace_init(struct record *rec)
828 {
829         int err;
830
831         if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
832             && record__threads_enabled(rec)) {
833                 pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
834                 return -EINVAL;
835         }
836
837         if (!rec->itr) {
838                 rec->itr = auxtrace_record__init(rec->evlist, &err);
839                 if (err)
840                         return err;
841         }
842
843         err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
844                                               rec->opts.auxtrace_snapshot_opts);
845         if (err)
846                 return err;
847
848         err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
849                                             rec->opts.auxtrace_sample_opts);
850         if (err)
851                 return err;
852
853         auxtrace_regroup_aux_output(rec->evlist);
854
855         return auxtrace_parse_filters(rec->evlist);
856 }
857
858 #else
859
860 static inline
861 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
862                                struct mmap *map __maybe_unused)
863 {
864         return 0;
865 }
866
867 static inline
868 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
869                                     bool on_exit __maybe_unused)
870 {
871 }
872
873 static inline
874 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
875 {
876         return 0;
877 }
878
879 static inline
880 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
881 {
882         return 0;
883 }
884
885 static int record__auxtrace_init(struct record *rec __maybe_unused)
886 {
887         return 0;
888 }
889
890 #endif
891
892 static int record__config_text_poke(struct evlist *evlist)
893 {
894         struct evsel *evsel;
895
896         /* Nothing to do if text poke is already configured */
897         evlist__for_each_entry(evlist, evsel) {
898                 if (evsel->core.attr.text_poke)
899                         return 0;
900         }
901
902         evsel = evlist__add_dummy_on_all_cpus(evlist);
903         if (!evsel)
904                 return -ENOMEM;
905
906         evsel->core.attr.text_poke = 1;
907         evsel->core.attr.ksymbol = 1;
908         evsel->immediate = true;
909         evsel__set_sample_bit(evsel, TIME);
910
911         return 0;
912 }
913
914 static int record__config_off_cpu(struct record *rec)
915 {
916         return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
917 }
918
919 static bool record__tracking_system_wide(struct record *rec)
920 {
921         struct evlist *evlist = rec->evlist;
922         struct evsel *evsel;
923
924         /*
925          * If a non-dummy evsel exists, system_wide sideband is needed to
926          * help parse sample information.
927          * For example, the PERF_RECORD_MMAP event helps parse symbols,
928          * and the PERF_RECORD_COMM event helps parse the task executable name.
929          */
930         evlist__for_each_entry(evlist, evsel) {
931                 if (!evsel__is_dummy_event(evsel))
932                         return true;
933         }
934
935         return false;
936 }
937
938 static int record__config_tracking_events(struct record *rec)
939 {
940         struct record_opts *opts = &rec->opts;
941         struct evlist *evlist = rec->evlist;
942         bool system_wide = false;
943         struct evsel *evsel;
944
945         /*
946          * For initial_delay, system wide or a hybrid system, we need to add a
947          * tracking event so that we can track PERF_RECORD_MMAP to cover the
948          * delay of waiting or of event synthesis.
949          */
950         if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
951             perf_pmus__num_core_pmus() > 1) {
952
953                 /*
954                  * User space tasks can migrate between CPUs, so when tracing
955                  * selected CPUs, sideband for all CPUs is still needed.
956                  */
957                 if (!!opts->target.cpu_list && record__tracking_system_wide(rec))
958                         system_wide = true;
959
960                 evsel = evlist__findnew_tracking_event(evlist, system_wide);
961                 if (!evsel)
962                         return -ENOMEM;
963
964                 /*
965                  * Enable the tracking event when the process is forked for
966                  * initial_delay, or immediately for system wide.
967                  */
968                 if (opts->target.initial_delay && !evsel->immediate &&
969                     !target__has_cpu(&opts->target))
970                         evsel->core.attr.enable_on_exec = 1;
971                 else
972                         evsel->immediate = 1;
973         }
974
975         return 0;
976 }
977
978 static bool record__kcore_readable(struct machine *machine)
979 {
980         char kcore[PATH_MAX];
981         int fd;
982
983         scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
984
985         fd = open(kcore, O_RDONLY);
986         if (fd < 0)
987                 return false;
988
989         close(fd);
990
991         return true;
992 }
993
994 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
995 {
996         char from_dir[PATH_MAX];
997         char kcore_dir[PATH_MAX];
998         int ret;
999
1000         snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
1001
1002         ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
1003         if (ret)
1004                 return ret;
1005
1006         return kcore_copy(from_dir, kcore_dir);
1007 }
1008
1009 static void record__thread_data_init_pipes(struct record_thread *thread_data)
1010 {
1011         thread_data->pipes.msg[0] = -1;
1012         thread_data->pipes.msg[1] = -1;
1013         thread_data->pipes.ack[0] = -1;
1014         thread_data->pipes.ack[1] = -1;
1015 }
1016
1017 static int record__thread_data_open_pipes(struct record_thread *thread_data)
1018 {
1019         if (pipe(thread_data->pipes.msg))
1020                 return -EINVAL;
1021
1022         if (pipe(thread_data->pipes.ack)) {
1023                 close(thread_data->pipes.msg[0]);
1024                 thread_data->pipes.msg[0] = -1;
1025                 close(thread_data->pipes.msg[1]);
1026                 thread_data->pipes.msg[1] = -1;
1027                 return -EINVAL;
1028         }
1029
1030         pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
1031                  thread_data->pipes.msg[0], thread_data->pipes.msg[1],
1032                  thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
1033
1034         return 0;
1035 }
1036
1037 static void record__thread_data_close_pipes(struct record_thread *thread_data)
1038 {
1039         if (thread_data->pipes.msg[0] != -1) {
1040                 close(thread_data->pipes.msg[0]);
1041                 thread_data->pipes.msg[0] = -1;
1042         }
1043         if (thread_data->pipes.msg[1] != -1) {
1044                 close(thread_data->pipes.msg[1]);
1045                 thread_data->pipes.msg[1] = -1;
1046         }
1047         if (thread_data->pipes.ack[0] != -1) {
1048                 close(thread_data->pipes.ack[0]);
1049                 thread_data->pipes.ack[0] = -1;
1050         }
1051         if (thread_data->pipes.ack[1] != -1) {
1052                 close(thread_data->pipes.ack[1]);
1053                 thread_data->pipes.ack[1] = -1;
1054         }
1055 }
1056
1057 static bool evlist__per_thread(struct evlist *evlist)
1058 {
1059         return cpu_map__is_dummy(evlist->core.user_requested_cpus);
1060 }
1061
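/*
 * Distribute the evlist mmaps among the record threads: in per-thread mode
 * all maps are assigned, otherwise a thread is assigned the maps whose CPU
 * is set in its maps mask.
 */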
1062 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
1063 {
1064         int m, tm, nr_mmaps = evlist->core.nr_mmaps;
1065         struct mmap *mmap = evlist->mmap;
1066         struct mmap *overwrite_mmap = evlist->overwrite_mmap;
1067         struct perf_cpu_map *cpus = evlist->core.all_cpus;
1068         bool per_thread = evlist__per_thread(evlist);
1069
1070         if (per_thread)
1071                 thread_data->nr_mmaps = nr_mmaps;
1072         else
1073                 thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1074                                                       thread_data->mask->maps.nbits);
1075         if (mmap) {
1076                 thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1077                 if (!thread_data->maps)
1078                         return -ENOMEM;
1079         }
1080         if (overwrite_mmap) {
1081                 thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1082                 if (!thread_data->overwrite_maps) {
1083                         zfree(&thread_data->maps);
1084                         return -ENOMEM;
1085                 }
1086         }
1087         pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1088                  thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1089
1090         for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1091                 if (per_thread ||
1092                     test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1093                         if (thread_data->maps) {
1094                                 thread_data->maps[tm] = &mmap[m];
1095                                 pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1096                                           thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1097                         }
1098                         if (thread_data->overwrite_maps) {
1099                                 thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1100                                 pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1101                                           thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1102                         }
1103                         tm++;
1104                 }
1105         }
1106
1107         return 0;
1108 }
1109
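/*
 * Build the thread's private pollfd array by duplicating from the evlist
 * pollfd only those entries whose private pointer refers to one of the
 * maps handled by this thread.
 */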
1110 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1111 {
1112         int f, tm, pos;
1113         struct mmap *map, *overwrite_map;
1114
1115         fdarray__init(&thread_data->pollfd, 64);
1116
1117         for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1118                 map = thread_data->maps ? thread_data->maps[tm] : NULL;
1119                 overwrite_map = thread_data->overwrite_maps ?
1120                                 thread_data->overwrite_maps[tm] : NULL;
1121
1122                 for (f = 0; f < evlist->core.pollfd.nr; f++) {
1123                         void *ptr = evlist->core.pollfd.priv[f].ptr;
1124
1125                         if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1126                                 pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1127                                                               &evlist->core.pollfd);
1128                                 if (pos < 0)
1129                                         return pos;
1130                                 pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1131                                          thread_data, pos, evlist->core.pollfd.entries[f].fd);
1132                         }
1133                 }
1134         }
1135
1136         return 0;
1137 }
1138
1139 static void record__free_thread_data(struct record *rec)
1140 {
1141         int t;
1142         struct record_thread *thread_data = rec->thread_data;
1143
1144         if (thread_data == NULL)
1145                 return;
1146
1147         for (t = 0; t < rec->nr_threads; t++) {
1148                 record__thread_data_close_pipes(&thread_data[t]);
1149                 zfree(&thread_data[t].maps);
1150                 zfree(&thread_data[t].overwrite_maps);
1151                 fdarray__exit(&thread_data[t].pollfd);
1152         }
1153
1154         zfree(&rec->thread_data);
1155 }
1156
1157 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1158                                                     int evlist_pollfd_index,
1159                                                     int thread_pollfd_index)
1160 {
1161         size_t x = rec->index_map_cnt;
1162
1163         if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1164                 return -ENOMEM;
1165         rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1166         rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1167         rec->index_map_cnt += 1;
1168         return 0;
1169 }
1170
1171 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1172                                                     struct evlist *evlist,
1173                                                     struct record_thread *thread_data)
1174 {
1175         struct pollfd *e_entries = evlist->core.pollfd.entries;
1176         struct pollfd *t_entries = thread_data->pollfd.entries;
1177         int err = 0;
1178         size_t i;
1179
1180         for (i = 0; i < rec->index_map_cnt; i++) {
1181                 int e_pos = rec->index_map[i].evlist_pollfd_index;
1182                 int t_pos = rec->index_map[i].thread_pollfd_index;
1183
1184                 if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1185                     e_entries[e_pos].events != t_entries[t_pos].events) {
1186                         pr_err("Thread and evlist pollfd index mismatch\n");
1187                         err = -EINVAL;
1188                         continue;
1189                 }
1190                 e_entries[e_pos].revents = t_entries[t_pos].revents;
1191         }
1192         return err;
1193 }
1194
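/*
 * Duplicate the non-perf-event descriptors (marked fdarray_flag__non_perf_event)
 * from the evlist pollfd into the main thread's pollfd and remember the
 * index mapping so that revents can later be copied back by
 * record__update_evlist_pollfd_from_thread().
 */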
1195 static int record__dup_non_perf_events(struct record *rec,
1196                                        struct evlist *evlist,
1197                                        struct record_thread *thread_data)
1198 {
1199         struct fdarray *fda = &evlist->core.pollfd;
1200         int i, ret;
1201
1202         for (i = 0; i < fda->nr; i++) {
1203                 if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1204                         continue;
1205                 ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1206                 if (ret < 0) {
1207                         pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1208                         return ret;
1209                 }
1210                 pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1211                           thread_data, ret, fda->entries[i].fd);
1212                 ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1213                 if (ret < 0) {
1214                         pr_err("Failed to map thread and evlist pollfd indexes\n");
1215                         return ret;
1216                 }
1217         }
1218         return 0;
1219 }
1220
1221 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1222 {
1223         int t, ret;
1224         struct record_thread *thread_data;
1225
1226         rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1227         if (!rec->thread_data) {
1228                 pr_err("Failed to allocate thread data\n");
1229                 return -ENOMEM;
1230         }
1231         thread_data = rec->thread_data;
1232
1233         for (t = 0; t < rec->nr_threads; t++)
1234                 record__thread_data_init_pipes(&thread_data[t]);
1235
1236         for (t = 0; t < rec->nr_threads; t++) {
1237                 thread_data[t].rec = rec;
1238                 thread_data[t].mask = &rec->thread_masks[t];
1239                 ret = record__thread_data_init_maps(&thread_data[t], evlist);
1240                 if (ret) {
1241                         pr_err("Failed to initialize thread[%d] maps\n", t);
1242                         goto out_free;
1243                 }
1244                 ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1245                 if (ret) {
1246                         pr_err("Failed to initialize thread[%d] pollfd\n", t);
1247                         goto out_free;
1248                 }
1249                 if (t) {
1250                         thread_data[t].tid = -1;
1251                         ret = record__thread_data_open_pipes(&thread_data[t]);
1252                         if (ret) {
1253                                 pr_err("Failed to open thread[%d] communication pipes\n", t);
1254                                 goto out_free;
1255                         }
1256                         ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1257                                            POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1258                         if (ret < 0) {
1259                                 pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1260                                 goto out_free;
1261                         }
1262                         thread_data[t].ctlfd_pos = ret;
1263                         pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1264                                  thread_data, thread_data[t].ctlfd_pos,
1265                                  thread_data[t].pipes.msg[0]);
1266                 } else {
1267                         thread_data[t].tid = gettid();
1268
1269                         ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1270                         if (ret < 0)
1271                                 goto out_free;
1272
1273                         thread_data[t].ctlfd_pos = -1; /* Not used */
1274                 }
1275         }
1276
1277         return 0;
1278
1279 out_free:
1280         record__free_thread_data(rec);
1281
1282         return ret;
1283 }
1284
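/*
 * mmap the evlist ring buffers with the configured AIO, affinity, flush and
 * compression parameters, set up the control file descriptor, allocate the
 * per-thread data and, in parallel streaming mode, create the output data
 * directory and bind every mmap to its own file in it.
 */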
1285 static int record__mmap_evlist(struct record *rec,
1286                                struct evlist *evlist)
1287 {
1288         int i, ret;
1289         struct record_opts *opts = &rec->opts;
1290         bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1291                                   opts->auxtrace_sample_mode;
1292         char msg[512];
1293
1294         if (opts->affinity != PERF_AFFINITY_SYS)
1295                 cpu__setup_cpunode_map();
1296
1297         if (evlist__mmap_ex(evlist, opts->mmap_pages,
1298                                  opts->auxtrace_mmap_pages,
1299                                  auxtrace_overwrite,
1300                                  opts->nr_cblocks, opts->affinity,
1301                                  opts->mmap_flush, opts->comp_level) < 0) {
1302                 if (errno == EPERM) {
1303                         pr_err("Permission error mapping pages.\n"
1304                                "Consider increasing "
1305                                "/proc/sys/kernel/perf_event_mlock_kb,\n"
1306                                "or try again with a smaller value of -m/--mmap_pages.\n"
1307                                "(current value: %u,%u)\n",
1308                                opts->mmap_pages, opts->auxtrace_mmap_pages);
1309                         return -errno;
1310                 } else {
1311                         pr_err("failed to mmap with %d (%s)\n", errno,
1312                                 str_error_r(errno, msg, sizeof(msg)));
1313                         if (errno)
1314                                 return -errno;
1315                         else
1316                                 return -EINVAL;
1317                 }
1318         }
1319
1320         if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1321                 return -1;
1322
1323         ret = record__alloc_thread_data(rec, evlist);
1324         if (ret)
1325                 return ret;
1326
1327         if (record__threads_enabled(rec)) {
1328                 ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1329                 if (ret) {
1330                         pr_err("Failed to create data directory: %s\n", strerror(-ret));
1331                         return ret;
1332                 }
1333                 for (i = 0; i < evlist->core.nr_mmaps; i++) {
1334                         if (evlist->mmap)
1335                                 evlist->mmap[i].file = &rec->data.dir.files[i];
1336                         if (evlist->overwrite_mmap)
1337                                 evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1338                 }
1339         }
1340
1341         return 0;
1342 }
1343
1344 static int record__mmap(struct record *rec)
1345 {
1346         return record__mmap_evlist(rec, rec->evlist);
1347 }
1348
1349 static int record__open(struct record *rec)
1350 {
1351         char msg[BUFSIZ];
1352         struct evsel *pos;
1353         struct evlist *evlist = rec->evlist;
1354         struct perf_session *session = rec->session;
1355         struct record_opts *opts = &rec->opts;
1356         int rc = 0;
1357
1358         evlist__config(evlist, opts, &callchain_param);
1359
1360         evlist__for_each_entry(evlist, pos) {
1361 try_again:
1362                 if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
1363                         if (evsel__fallback(pos, &opts->target, errno, msg, sizeof(msg))) {
1364                                 if (verbose > 0)
1365                                         ui__warning("%s\n", msg);
1366                                 goto try_again;
1367                         }
1368                         if ((errno == EINVAL || errno == EBADF) &&
1369                             pos->core.leader != &pos->core &&
1370                             pos->weak_group) {
1371                                 pos = evlist__reset_weak_group(evlist, pos, true);
1372                                 goto try_again;
1373                         }
1374                         rc = -errno;
1375                         evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
1376                         ui__error("%s\n", msg);
1377                         goto out;
1378                 }
1379
1380                 pos->supported = true;
1381         }
1382
1383         if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1384                 pr_warning(
1385 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1386 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1387 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1388 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1389 "Samples in kernel modules won't be resolved at all.\n\n"
1390 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1391 "even with a suitable vmlinux or kallsyms file.\n\n");
1392         }
1393
1394         if (evlist__apply_filters(evlist, &pos)) {
1395                 pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
1396                         pos->filter ?: "BPF", evsel__name(pos), errno,
1397                         str_error_r(errno, msg, sizeof(msg)));
1398                 rc = -1;
1399                 goto out;
1400         }
1401
1402         rc = record__mmap(rec);
1403         if (rc)
1404                 goto out;
1405
1406         session->evlist = evlist;
1407         perf_session__set_id_hdr_size(session);
1408 out:
1409         return rc;
1410 }
1411
1412 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1413 {
1414         if (rec->evlist->first_sample_time == 0)
1415                 rec->evlist->first_sample_time = sample_time;
1416
1417         if (sample_time)
1418                 rec->evlist->last_sample_time = sample_time;
1419 }
1420
1421 static int process_sample_event(struct perf_tool *tool,
1422                                 union perf_event *event,
1423                                 struct perf_sample *sample,
1424                                 struct evsel *evsel,
1425                                 struct machine *machine)
1426 {
1427         struct record *rec = container_of(tool, struct record, tool);
1428
1429         set_timestamp_boundary(rec, sample->time);
1430
1431         if (rec->buildid_all)
1432                 return 0;
1433
1434         rec->samples++;
1435         return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1436 }
1437
1438 static int process_buildids(struct record *rec)
1439 {
1440         struct perf_session *session = rec->session;
1441
1442         if (perf_data__size(&rec->data) == 0)
1443                 return 0;
1444
1445         /*
1446          * During this process, it'll load the kernel map and replace
1447          * dso->long_name with the real pathname it found.  In this case
1448          * we prefer a vmlinux path like
1449          *   /lib/modules/3.16.4/build/vmlinux
1450          *
1451          * rather than the build-id path (in the debug directory):
1452          *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1453          */
1454         symbol_conf.ignore_vmlinux_buildid = true;
1455
1456         /*
1457          * If --buildid-all is given, it marks all DSOs regardless of hits,
1458          * so there is no need to process samples. But if timestamp_boundary is
1459          * enabled, it still needs to walk all samples to get the timestamps of
1460          * the first/last samples.
1461          */
1462         if (rec->buildid_all && !rec->timestamp_boundary)
1463                 rec->tool.sample = NULL;
1464
1465         return perf_session__process_events(session);
1466 }
1467
1468 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1469 {
1470         int err;
1471         struct perf_tool *tool = data;
1472         /*
1473          * As for the guest kernel, when processing the record & report subcommands,
1474          * we arrange the module mmap prior to the guest kernel mmap and trigger
1475          * a dso preload, because default guest module symbols are loaded
1476          * from guest kallsyms instead of /lib/modules/XXX/XXX. This
1477          * method is used to avoid missing symbols when the first addr is
1478          * in a module instead of in the guest kernel.
1479          */
1480         err = perf_event__synthesize_modules(tool, process_synthesized_event,
1481                                              machine);
1482         if (err < 0)
1483                 pr_err("Couldn't record guest kernel [%d]'s reference"
1484                        " relocation symbol.\n", machine->pid);
1485
1486         /*
1487          * We use _stext for the guest kernel because the guest kernel's
1488          * /proc/kallsyms sometimes has no _text.
1489          */
1490         err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1491                                                  machine);
1492         if (err < 0)
1493                 pr_err("Couldn't record guest kernel [%d]'s reference"
1494                        " relocation symbol.\n", machine->pid);
1495 }
1496
1497 static struct perf_event_header finished_round_event = {
1498         .size = sizeof(struct perf_event_header),
1499         .type = PERF_RECORD_FINISHED_ROUND,
1500 };
1501
1502 static struct perf_event_header finished_init_event = {
1503         .size = sizeof(struct perf_event_header),
1504         .type = PERF_RECORD_FINISHED_INIT,
1505 };
1506
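/*
 * For affinity modes other than "sys", move the current record thread onto
 * the CPUs in the mmap's affinity mask before reading it, so that the ring
 * buffer is accessed from its own CPU or NUMA node.
 */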
1507 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1508 {
1509         if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1510             !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1511                           thread->mask->affinity.nbits)) {
1512                 bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1513                 bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1514                           map->affinity_mask.bits, thread->mask->affinity.nbits);
1515                 sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1516                                         (cpu_set_t *)thread->mask->affinity.bits);
1517                 if (verbose == 2) {
1518                         pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1519                         mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1520                 }
1521         }
1522 }
1523
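/*
 * Callback handed to zstd_compress_stream_to_records(): the first call
 * (increment == 0) lays down a PERF_RECORD_COMPRESSED header and returns
 * its size, subsequent calls grow header.size by the number of compressed
 * bytes appended behind it.
 */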
1524 static size_t process_comp_header(void *record, size_t increment)
1525 {
1526         struct perf_record_compressed *event = record;
1527         size_t size = sizeof(*event);
1528
1529         if (increment) {
1530                 event->header.size += increment;
1531                 return increment;
1532         }
1533
1534         event->header.type = PERF_RECORD_COMPRESSED;
1535         event->header.size = size;
1536
1537         return size;
1538 }
1539
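/*
 * Compress @src into @dst as PERF_RECORD_COMPRESSED record payload. In
 * threaded (directory) mode the per-mmap zstd stream and per-thread byte
 * counters are used, otherwise the session-wide ones; these counters feed
 * the compression ratio reported at the end of the run.
 */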
1540 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
1541                             void *dst, size_t dst_size, void *src, size_t src_size)
1542 {
1543         ssize_t compressed;
1544         size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1545         struct zstd_data *zstd_data = &session->zstd_data;
1546
1547         if (map && map->file)
1548                 zstd_data = &map->zstd_data;
1549
1550         compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1551                                                      max_record_size, process_comp_header);
1552         if (compressed < 0)
1553                 return compressed;
1554
1555         if (map && map->file) {
1556                 thread->bytes_transferred += src_size;
1557                 thread->bytes_compressed  += compressed;
1558         } else {
1559                 session->bytes_transferred += src_size;
1560                 session->bytes_compressed  += compressed;
1561         }
1562
1563         return compressed;
1564 }
1565
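/*
 * Drain every mmap owned by the calling record thread: push ring buffer
 * data either directly (record__pushfn) or via AIO, read AUX area data
 * unless in snapshot/sample mode and, outside of directory mode, emit a
 * PERF_RECORD_FINISHED_ROUND marker if anything was written.
 */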
1566 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1567                                     bool overwrite, bool synch)
1568 {
1569         u64 bytes_written = rec->bytes_written;
1570         int i;
1571         int rc = 0;
1572         int nr_mmaps;
1573         struct mmap **maps;
1574         int trace_fd = rec->data.file.fd;
1575         off_t off = 0;
1576
1577         if (!evlist)
1578                 return 0;
1579
1580         nr_mmaps = thread->nr_mmaps;
1581         maps = overwrite ? thread->overwrite_maps : thread->maps;
1582
1583         if (!maps)
1584                 return 0;
1585
1586         if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1587                 return 0;
1588
1589         if (record__aio_enabled(rec))
1590                 off = record__aio_get_pos(trace_fd);
1591
1592         for (i = 0; i < nr_mmaps; i++) {
1593                 u64 flush = 0;
1594                 struct mmap *map = maps[i];
1595
1596                 if (map->core.base) {
1597                         record__adjust_affinity(rec, map);
1598                         if (synch) {
1599                                 flush = map->core.flush;
1600                                 map->core.flush = 1;
1601                         }
1602                         if (!record__aio_enabled(rec)) {
1603                                 if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1604                                         if (synch)
1605                                                 map->core.flush = flush;
1606                                         rc = -1;
1607                                         goto out;
1608                                 }
1609                         } else {
1610                                 if (record__aio_push(rec, map, &off) < 0) {
1611                                         record__aio_set_pos(trace_fd, off);
1612                                         if (synch)
1613                                                 map->core.flush = flush;
1614                                         rc = -1;
1615                                         goto out;
1616                                 }
1617                         }
1618                         if (synch)
1619                                 map->core.flush = flush;
1620                 }
1621
1622                 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1623                     !rec->opts.auxtrace_sample_mode &&
1624                     record__auxtrace_mmap_read(rec, map) != 0) {
1625                         rc = -1;
1626                         goto out;
1627                 }
1628         }
1629
1630         if (record__aio_enabled(rec))
1631                 record__aio_set_pos(trace_fd, off);
1632
1633         /*
1634          * Mark the round finished in case we wrote
1635          * at least one event.
1636          *
1637          * No need for round events in directory mode,
1638          * because the per-cpu maps and files already have
1639          * their data sorted by the kernel.
1640          */
1641         if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1642                 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1643
1644         if (overwrite)
1645                 evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1646 out:
1647         return rc;
1648 }
1649
1650 static int record__mmap_read_all(struct record *rec, bool synch)
1651 {
1652         int err;
1653
1654         err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1655         if (err)
1656                 return err;
1657
1658         return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1659 }
1660
1661 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1662                                            void *arg __maybe_unused)
1663 {
1664         struct perf_mmap *map = fda->priv[fd].ptr;
1665
1666         if (map)
1667                 perf_mmap__put(map);
1668 }
1669
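/*
 * Body of each extra reader thread in parallel (--threads) mode: announce
 * readiness on the ack pipe, then loop draining this thread's mmaps and
 * polling when idle, until the msg pipe is closed (POLLHUP) by
 * record__terminate_thread(), then do a final synchronous flush and ack
 * termination.
 */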
1670 static void *record__thread(void *arg)
1671 {
1672         enum thread_msg msg = THREAD_MSG__READY;
1673         bool terminate = false;
1674         struct fdarray *pollfd;
1675         int err, ctlfd_pos;
1676
1677         thread = arg;
1678         thread->tid = gettid();
1679
1680         err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1681         if (err == -1)
1682                 pr_warning("threads[%d]: failed to notify on start: %s\n",
1683                            thread->tid, strerror(errno));
1684
1685         pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1686
1687         pollfd = &thread->pollfd;
1688         ctlfd_pos = thread->ctlfd_pos;
1689
1690         for (;;) {
1691                 unsigned long long hits = thread->samples;
1692
1693                 if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
1694                         break;
1695
1696                 if (hits == thread->samples) {
1697
1698                         err = fdarray__poll(pollfd, -1);
1699                         /*
1700                          * Propagate the error only if there is one. Ignore a positive
1701                          * number of returned events and an interrupted poll (EINTR).
1702                          */
1703                         if (err > 0 || (err < 0 && errno == EINTR))
1704                                 err = 0;
1705                         thread->waking++;
1706
1707                         if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1708                                             record__thread_munmap_filtered, NULL) == 0)
1709                                 break;
1710                 }
1711
1712                 if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1713                         terminate = true;
1714                         close(thread->pipes.msg[0]);
1715                         thread->pipes.msg[0] = -1;
1716                         pollfd->entries[ctlfd_pos].fd = -1;
1717                         pollfd->entries[ctlfd_pos].events = 0;
1718                 }
1719
1720                 pollfd->entries[ctlfd_pos].revents = 0;
1721         }
1722         record__mmap_read_all(thread->rec, true);
1723
1724         err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1725         if (err == -1)
1726                 pr_warning("threads[%d]: failed to notify on termination: %s\n",
1727                            thread->tid, strerror(errno));
1728
1729         return NULL;
1730 }
1731
1732 static void record__init_features(struct record *rec)
1733 {
1734         struct perf_session *session = rec->session;
1735         int feat;
1736
1737         for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1738                 perf_header__set_feat(&session->header, feat);
1739
1740         if (rec->no_buildid)
1741                 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1742
1743 #ifdef HAVE_LIBTRACEEVENT
1744         if (!have_tracepoints(&rec->evlist->core.entries))
1745                 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1746 #endif
1747
1748         if (!rec->opts.branch_stack)
1749                 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1750
1751         if (!rec->opts.full_auxtrace)
1752                 perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1753
1754         if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1755                 perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1756
1757         if (!rec->opts.use_clockid)
1758                 perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1759
1760         if (!record__threads_enabled(rec))
1761                 perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1762
1763         if (!record__comp_enabled(rec))
1764                 perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1765
1766         perf_header__clear_feat(&session->header, HEADER_STAT);
1767 }
1768
1769 static void
1770 record__finish_output(struct record *rec)
1771 {
1772         int i;
1773         struct perf_data *data = &rec->data;
1774         int fd = perf_data__fd(data);
1775
1776         if (data->is_pipe) {
1777                 /* Just to display approx. size */
1778                 data->file.size = rec->bytes_written;
1779                 return;
1780         }
1781
1782         rec->session->header.data_size += rec->bytes_written;
1783         data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1784         if (record__threads_enabled(rec)) {
1785                 for (i = 0; i < data->dir.nr; i++)
1786                         data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1787         }
1788
1789         if (!rec->no_buildid) {
1790                 process_buildids(rec);
1791
1792                 if (rec->buildid_all)
1793                         dsos__hit_all(rec->session);
1794         }
1795         perf_session__write_header(rec->session, rec->evlist, fd, true);
1796
1797         return;
1798 }
1799
1800 static int record__synthesize_workload(struct record *rec, bool tail)
1801 {
1802         int err;
1803         struct perf_thread_map *thread_map;
1804         bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1805
1806         if (rec->opts.tail_synthesize != tail)
1807                 return 0;
1808
1809         thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1810         if (thread_map == NULL)
1811                 return -1;
1812
1813         err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1814                                                  process_synthesized_event,
1815                                                  &rec->session->machines.host,
1816                                                  needs_mmap,
1817                                                  rec->opts.sample_address);
1818         perf_thread_map__put(thread_map);
1819         return err;
1820 }
1821
1822 static int write_finished_init(struct record *rec, bool tail)
1823 {
1824         if (rec->opts.tail_synthesize != tail)
1825                 return 0;
1826
1827         return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1828 }
1829
1830 static int record__synthesize(struct record *rec, bool tail);
1831
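/*
 * Rotate the output: finalize the current output file and switch to a new
 * timestamped one, re-synthesizing the tracking events the new file needs.
 * Reached via --switch-output; illustrative invocations (exact option
 * forms per perf-record(1)):
 *
 *   perf record --switch-output -a        # rotate on SIGUSR2
 *   perf record --switch-output=10M -a    # rotate after ~10MB of data
 */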
1832 static int
1833 record__switch_output(struct record *rec, bool at_exit)
1834 {
1835         struct perf_data *data = &rec->data;
1836         char *new_filename = NULL;
1837         int fd, err;
1838
1839         /* Same size as a real timestamp, e.g. "2015122520103046" */
1840         char timestamp[] = "InvalidTimestamp";
1841
1842         record__aio_mmap_read_sync(rec);
1843
1844         write_finished_init(rec, true);
1845
1846         record__synthesize(rec, true);
1847         if (target__none(&rec->opts.target))
1848                 record__synthesize_workload(rec, true);
1849
1850         rec->samples = 0;
1851         record__finish_output(rec);
1852         err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1853         if (err) {
1854                 pr_err("Failed to get current timestamp\n");
1855                 return -EINVAL;
1856         }
1857
1858         fd = perf_data__switch(data, timestamp,
1859                                rec->session->header.data_offset,
1860                                at_exit, &new_filename);
1861         if (fd >= 0 && !at_exit) {
1862                 rec->bytes_written = 0;
1863                 rec->session->header.data_size = 0;
1864         }
1865
1866         if (!quiet) {
1867                 fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1868                         data->path, timestamp);
1869         }
1870
1871         if (rec->switch_output.num_files) {
1872                 int n = rec->switch_output.cur_file + 1;
1873
1874                 if (n >= rec->switch_output.num_files)
1875                         n = 0;
1876                 rec->switch_output.cur_file = n;
1877                 if (rec->switch_output.filenames[n]) {
1878                         remove(rec->switch_output.filenames[n]);
1879                         zfree(&rec->switch_output.filenames[n]);
1880                 }
1881                 rec->switch_output.filenames[n] = new_filename;
1882         } else {
1883                 free(new_filename);
1884         }
1885
1886         /* Output tracking events */
1887         if (!at_exit) {
1888                 record__synthesize(rec, false);
1889
1890                 /*
1891                  * In 'perf record --switch-output' without -a,
1892                  * record__synthesize() in record__switch_output() won't
1893                  * generate tracking events because there is no thread_map
1894                  * in the evlist, so the newly created perf.data would not
1895                  * contain mmap and comm information.
1896                  * Create a fake thread_map and call
1897                  * perf_event__synthesize_thread_map() directly for those events.
1898                  */
1899                 if (target__none(&rec->opts.target))
1900                         record__synthesize_workload(rec, false);
1901                 write_finished_init(rec, false);
1902         }
1903         return fd;
1904 }
1905
1906 static void __record__save_lost_samples(struct record *rec, struct evsel *evsel,
1907                                         struct perf_record_lost_samples *lost,
1908                                         int cpu_idx, int thread_idx, u64 lost_count,
1909                                         u16 misc_flag)
1910 {
1911         struct perf_sample_id *sid;
1912         struct perf_sample sample = {};
1913         int id_hdr_size;
1914
1915         lost->lost = lost_count;
1916         if (evsel->core.ids) {
1917                 sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1918                 sample.id = sid->id;
1919         }
1920
1921         id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1922                                                        evsel->core.attr.sample_type, &sample);
1923         lost->header.size = sizeof(*lost) + id_hdr_size;
1924         lost->header.misc = misc_flag;
1925         record__write(rec, NULL, lost, lost->header.size);
1926 }
1927
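/*
 * At the end of the run, read the kernel's lost-sample counts for each
 * event/CPU/thread (plus the BPF filter's own drop count) and emit
 * matching PERF_RECORD_LOST_SAMPLES records so later analysis can account
 * for the dropped samples.
 */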
1928 static void record__read_lost_samples(struct record *rec)
1929 {
1930         struct perf_session *session = rec->session;
1931         struct perf_record_lost_samples *lost = NULL;
1932         struct evsel *evsel;
1933
1934         /* there was an error during record__open */
1935         if (session->evlist == NULL)
1936                 return;
1937
1938         evlist__for_each_entry(session->evlist, evsel) {
1939                 struct xyarray *xy = evsel->core.sample_id;
1940                 u64 lost_count;
1941
1942                 if (xy == NULL || evsel->core.fd == NULL)
1943                         continue;
1944                 if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
1945                     xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
1946                         pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
1947                         continue;
1948                 }
1949
1950                 for (int x = 0; x < xyarray__max_x(xy); x++) {
1951                         for (int y = 0; y < xyarray__max_y(xy); y++) {
1952                                 struct perf_counts_values count;
1953
1954                                 if (perf_evsel__read(&evsel->core, x, y, &count) < 0) {
1955                                         pr_debug("read LOST count failed\n");
1956                                         goto out;
1957                                 }
1958
1959                                 if (count.lost) {
1960                                         if (!lost) {
1961                                                 lost = zalloc(sizeof(*lost) +
1962                                                               session->machines.host.id_hdr_size);
1963                                                 if (!lost) {
1964                                                         pr_debug("Memory allocation failed\n");
1965                                                         return;
1966                                                 }
1967                                                 lost->header.type = PERF_RECORD_LOST_SAMPLES;
1968                                         }
1969                                         __record__save_lost_samples(rec, evsel, lost,
1970                                                                     x, y, count.lost, 0);
1971                                 }
1972                         }
1973                 }
1974
1975                 lost_count = perf_bpf_filter__lost_count(evsel);
1976                 if (lost_count) {
1977                         if (!lost) {
1978                                 lost = zalloc(sizeof(*lost) +
1979                                               session->machines.host.id_hdr_size);
1980                                 if (!lost) {
1981                                         pr_debug("Memory allocation failed\n");
1982                                         return;
1983                                 }
1984                                 lost->header.type = PERF_RECORD_LOST_SAMPLES;
1985                         }
1986                         __record__save_lost_samples(rec, evsel, lost, 0, 0, lost_count,
1987                                                     PERF_RECORD_MISC_LOST_SAMPLES_BPF);
1988                 }
1989         }
1990 out:
1991         free(lost);
1992 }
1993
1994 static volatile sig_atomic_t workload_exec_errno;
1995
1996 /*
1997  * evlist__prepare_workload() will send a SIGUSR1
1998  * if the fork fails, since we asked for it by
1999  * setting its want_signal to true.
2000  */
2001 static void workload_exec_failed_signal(int signo __maybe_unused,
2002                                         siginfo_t *info,
2003                                         void *ucontext __maybe_unused)
2004 {
2005         workload_exec_errno = info->si_value.sival_int;
2006         done = 1;
2007         child_finished = 1;
2008 }
2009
2010 static void snapshot_sig_handler(int sig);
2011 static void alarm_sig_handler(int sig);
2012
2013 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
2014 {
2015         if (evlist) {
2016                 if (evlist->mmap && evlist->mmap[0].core.base)
2017                         return evlist->mmap[0].core.base;
2018                 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
2019                         return evlist->overwrite_mmap[0].core.base;
2020         }
2021         return NULL;
2022 }
2023
2024 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
2025 {
2026         const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
2027         if (pc)
2028                 return pc;
2029         return NULL;
2030 }
2031
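/*
 * Synthesize the side-band records the kernel will not produce for us:
 * pipe-mode metadata (when writing to a pipe), time-conversion and
 * id-index records, auxtrace info, kernel and module mmaps, extra attrs,
 * thread/cpu maps, BPF and cgroup events, and finally the pre-existing
 * threads of the target (optionally spread over
 * opts->nr_threads_synthesize worker threads).
 */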
2032 static int record__synthesize(struct record *rec, bool tail)
2033 {
2034         struct perf_session *session = rec->session;
2035         struct machine *machine = &session->machines.host;
2036         struct perf_data *data = &rec->data;
2037         struct record_opts *opts = &rec->opts;
2038         struct perf_tool *tool = &rec->tool;
2039         int err = 0;
2040         event_op f = process_synthesized_event;
2041
2042         if (rec->opts.tail_synthesize != tail)
2043                 return 0;
2044
2045         if (data->is_pipe) {
2046                 err = perf_event__synthesize_for_pipe(tool, session, data,
2047                                                       process_synthesized_event);
2048                 if (err < 0)
2049                         goto out;
2050
2051                 rec->bytes_written += err;
2052         }
2053
2054         err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
2055                                           process_synthesized_event, machine);
2056         if (err)
2057                 goto out;
2058
2059         /* Synthesize id_index before auxtrace_info */
2060         err = perf_event__synthesize_id_index(tool,
2061                                               process_synthesized_event,
2062                                               session->evlist, machine);
2063         if (err)
2064                 goto out;
2065
2066         if (rec->opts.full_auxtrace) {
2067                 err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
2068                                         session, process_synthesized_event);
2069                 if (err)
2070                         goto out;
2071         }
2072
2073         if (!evlist__exclude_kernel(rec->evlist)) {
2074                 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2075                                                          machine);
2076                 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2077                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2078                                    "Check /proc/kallsyms permission or run as root.\n");
2079
2080                 err = perf_event__synthesize_modules(tool, process_synthesized_event,
2081                                                      machine);
2082                 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2083                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2084                                    "Check /proc/modules permission or run as root.\n");
2085         }
2086
2087         if (perf_guest) {
2088                 machines__process_guests(&session->machines,
2089                                          perf_event__synthesize_guest_os, tool);
2090         }
2091
2092         err = perf_event__synthesize_extra_attr(&rec->tool,
2093                                                 rec->evlist,
2094                                                 process_synthesized_event,
2095                                                 data->is_pipe);
2096         if (err)
2097                 goto out;
2098
2099         err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2100                                                  process_synthesized_event,
2101                                                 NULL);
2102         if (err < 0) {
2103                 pr_err("Couldn't synthesize thread map.\n");
2104                 return err;
2105         }
2106
2107         err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2108                                              process_synthesized_event, NULL);
2109         if (err < 0) {
2110                 pr_err("Couldn't synthesize cpu map.\n");
2111                 return err;
2112         }
2113
2114         err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2115                                                 machine, opts);
2116         if (err < 0) {
2117                 pr_warning("Couldn't synthesize bpf events.\n");
2118                 err = 0;
2119         }
2120
2121         if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2122                 err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2123                                                      machine);
2124                 if (err < 0) {
2125                         pr_warning("Couldn't synthesize cgroup events.\n");
2126                         err = 0;
2127                 }
2128         }
2129
2130         if (rec->opts.nr_threads_synthesize > 1) {
2131                 mutex_init(&synth_lock);
2132                 perf_set_multithreaded();
2133                 f = process_locked_synthesized_event;
2134         }
2135
2136         if (rec->opts.synth & PERF_SYNTH_TASK) {
2137                 bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2138
2139                 err = __machine__synthesize_threads(machine, tool, &opts->target,
2140                                                     rec->evlist->core.threads,
2141                                                     f, needs_mmap, opts->sample_address,
2142                                                     rec->opts.nr_threads_synthesize);
2143         }
2144
2145         if (rec->opts.nr_threads_synthesize > 1) {
2146                 perf_set_singlethreaded();
2147                 mutex_destroy(&synth_lock);
2148         }
2149
2150 out:
2151         return err;
2152 }
2153
2154 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2155 {
2156         struct record *rec = data;
2157         pthread_kill(rec->thread_id, SIGUSR2);
2158         return 0;
2159 }
2160
2161 static int record__setup_sb_evlist(struct record *rec)
2162 {
2163         struct record_opts *opts = &rec->opts;
2164
2165         if (rec->sb_evlist != NULL) {
2166                 /*
2167                  * We get here if --switch-output-event populated the
2168                  * sb_evlist, so associate a callback that will send a SIGUSR2
2169                  * to the main thread.
2170                  */
2171                 evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2172                 rec->thread_id = pthread_self();
2173         }
2174 #ifdef HAVE_LIBBPF_SUPPORT
2175         if (!opts->no_bpf_event) {
2176                 if (rec->sb_evlist == NULL) {
2177                         rec->sb_evlist = evlist__new();
2178
2179                         if (rec->sb_evlist == NULL) {
2180                                 pr_err("Couldn't create side band evlist.\n");
2181                                 return -1;
2182                         }
2183                 }
2184
2185                 if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
2186                         pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n");
2187                         return -1;
2188                 }
2189         }
2190 #endif
2191         if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2192                 pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2193                 opts->no_bpf_event = true;
2194         }
2195
2196         return 0;
2197 }
2198
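/*
 * With --clockid, record the chosen clockid and a matched pair of
 * reference timestamps (gettimeofday() wall clock vs. clock_gettime() on
 * the sampling clock, both in nanoseconds) in the header so that sample
 * times can later be converted to wall-clock time.
 */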
2199 static int record__init_clock(struct record *rec)
2200 {
2201         struct perf_session *session = rec->session;
2202         struct timespec ref_clockid;
2203         struct timeval ref_tod;
2204         u64 ref;
2205
2206         if (!rec->opts.use_clockid)
2207                 return 0;
2208
2209         if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2210                 session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
2211
2212         session->header.env.clock.clockid = rec->opts.clockid;
2213
2214         if (gettimeofday(&ref_tod, NULL) != 0) {
2215                 pr_err("gettimeofday failed, cannot set reference time.\n");
2216                 return -1;
2217         }
2218
2219         if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2220                 pr_err("clock_gettime failed, cannot set reference time.\n");
2221                 return -1;
2222         }
2223
2224         ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2225               (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2226
2227         session->header.env.clock.tod_ns = ref;
2228
2229         ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2230               (u64) ref_clockid.tv_nsec;
2231
2232         session->header.env.clock.clockid_ns = ref;
2233         return 0;
2234 }
2235
2236 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2237 {
2238         if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2239                 trigger_hit(&auxtrace_snapshot_trigger);
2240                 auxtrace_record__snapshot_started = 1;
2241                 if (auxtrace_record__snapshot_start(rec->itr))
2242                         trigger_error(&auxtrace_snapshot_trigger);
2243         }
2244 }
2245
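/*
 * Ask a reader thread to stop by closing the write end of its msg pipe:
 * the thread sees POLLHUP, does its final flush and answers on the ack
 * pipe, which is read here to confirm termination.
 */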
2246 static int record__terminate_thread(struct record_thread *thread_data)
2247 {
2248         int err;
2249         enum thread_msg ack = THREAD_MSG__UNDEFINED;
2250         pid_t tid = thread_data->tid;
2251
2252         close(thread_data->pipes.msg[1]);
2253         thread_data->pipes.msg[1] = -1;
2254         err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2255         if (err > 0)
2256                 pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2257         else
2258                 pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2259                            thread->tid, tid);
2260
2261         return 0;
2262 }
2263
2264 static int record__start_threads(struct record *rec)
2265 {
2266         int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2267         struct record_thread *thread_data = rec->thread_data;
2268         sigset_t full, mask;
2269         pthread_t handle;
2270         pthread_attr_t attrs;
2271
2272         thread = &thread_data[0];
2273
2274         if (!record__threads_enabled(rec))
2275                 return 0;
2276
2277         sigfillset(&full);
2278         if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2279                 pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2280                 return -1;
2281         }
2282
2283         pthread_attr_init(&attrs);
2284         pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2285
2286         for (t = 1; t < nr_threads; t++) {
2287                 enum thread_msg msg = THREAD_MSG__UNDEFINED;
2288
2289 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2290                 pthread_attr_setaffinity_np(&attrs,
2291                                             MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2292                                             (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2293 #endif
2294                 if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2295                         for (tt = 1; tt < t; tt++)
2296                                 record__terminate_thread(&thread_data[tt]);
2297                         pr_err("Failed to start threads: %s\n", strerror(errno));
2298                         ret = -1;
2299                         goto out_err;
2300                 }
2301
2302                 err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2303                 if (err > 0)
2304                         pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2305                                   thread_msg_tags[msg]);
2306                 else
2307                         pr_warning("threads[%d]: failed to receive start notification from %d\n",
2308                                    thread->tid, rec->thread_data[t].tid);
2309         }
2310
2311         sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2312                         (cpu_set_t *)thread->mask->affinity.bits);
2313
2314         pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2315
2316 out_err:
2317         pthread_attr_destroy(&attrs);
2318
2319         if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2320                 pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2321                 ret = -1;
2322         }
2323
2324         return ret;
2325 }
2326
2327 static int record__stop_threads(struct record *rec)
2328 {
2329         int t;
2330         struct record_thread *thread_data = rec->thread_data;
2331
2332         for (t = 1; t < rec->nr_threads; t++)
2333                 record__terminate_thread(&thread_data[t]);
2334
2335         for (t = 0; t < rec->nr_threads; t++) {
2336                 rec->samples += thread_data[t].samples;
2337                 if (!record__threads_enabled(rec))
2338                         continue;
2339                 rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2340                 rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2341                 pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2342                          thread_data[t].samples, thread_data[t].waking);
2343                 if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2344                         pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2345                                  thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2346                 else
2347                         pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2348         }
2349
2350         return 0;
2351 }
2352
2353 static unsigned long record__waking(struct record *rec)
2354 {
2355         int t;
2356         unsigned long waking = 0;
2357         struct record_thread *thread_data = rec->thread_data;
2358
2359         for (t = 0; t < rec->nr_threads; t++)
2360                 waking += thread_data[t].waking;
2361
2362         return waking;
2363 }
2364
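/*
 * Main body of 'perf record': set up the session, synthesize the initial
 * metadata, start the workload and/or enable the events, then loop
 * draining the ring buffers until done or draining, honoring auxtrace
 * snapshot and switch-output triggers as well as control fd commands,
 * and finally flush, account lost samples and finalize the output.
 */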
2365 static int __cmd_record(struct record *rec, int argc, const char **argv)
2366 {
2367         int err;
2368         int status = 0;
2369         const bool forks = argc > 0;
2370         struct perf_tool *tool = &rec->tool;
2371         struct record_opts *opts = &rec->opts;
2372         struct perf_data *data = &rec->data;
2373         struct perf_session *session;
2374         bool disabled = false, draining = false;
2375         int fd;
2376         float ratio = 0;
2377         enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2378
2379         atexit(record__sig_exit);
2380         signal(SIGCHLD, sig_handler);
2381         signal(SIGINT, sig_handler);
2382         signal(SIGTERM, sig_handler);
2383         signal(SIGSEGV, sigsegv_handler);
2384
2385         if (rec->opts.record_namespaces)
2386                 tool->namespace_events = true;
2387
2388         if (rec->opts.record_cgroup) {
2389 #ifdef HAVE_FILE_HANDLE
2390                 tool->cgroup_events = true;
2391 #else
2392                 pr_err("cgroup tracking is not supported\n");
2393                 return -1;
2394 #endif
2395         }
2396
2397         if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2398                 signal(SIGUSR2, snapshot_sig_handler);
2399                 if (rec->opts.auxtrace_snapshot_mode)
2400                         trigger_on(&auxtrace_snapshot_trigger);
2401                 if (rec->switch_output.enabled)
2402                         trigger_on(&switch_output_trigger);
2403         } else {
2404                 signal(SIGUSR2, SIG_IGN);
2405         }
2406
2407         session = perf_session__new(data, tool);
2408         if (IS_ERR(session)) {
2409                 pr_err("Perf session creation failed.\n");
2410                 return PTR_ERR(session);
2411         }
2412
2413         if (record__threads_enabled(rec)) {
2414                 if (perf_data__is_pipe(&rec->data)) {
2415                         pr_err("Parallel trace streaming is not available in pipe mode.\n");
2416                         return -1;
2417                 }
2418                 if (rec->opts.full_auxtrace) {
2419                         pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2420                         return -1;
2421                 }
2422         }
2423
2424         fd = perf_data__fd(data);
2425         rec->session = session;
2426
2427         if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2428                 pr_err("Compression initialization failed.\n");
2429                 return -1;
2430         }
2431 #ifdef HAVE_EVENTFD_SUPPORT
2432         done_fd = eventfd(0, EFD_NONBLOCK);
2433         if (done_fd < 0) {
2434                 pr_err("Failed to create wakeup eventfd, error: %m\n");
2435                 status = -1;
2436                 goto out_delete_session;
2437         }
2438         err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2439         if (err < 0) {
2440                 pr_err("Failed to add wakeup eventfd to poll list\n");
2441                 status = err;
2442                 goto out_delete_session;
2443         }
2444 #endif // HAVE_EVENTFD_SUPPORT
2445
2446         session->header.env.comp_type  = PERF_COMP_ZSTD;
2447         session->header.env.comp_level = rec->opts.comp_level;
2448
2449         if (rec->opts.kcore &&
2450             !record__kcore_readable(&session->machines.host)) {
2451                 pr_err("ERROR: kcore is not readable.\n");
2452                 return -1;
2453         }
2454
2455         if (record__init_clock(rec))
2456                 return -1;
2457
2458         record__init_features(rec);
2459
2460         if (forks) {
2461                 err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2462                                                workload_exec_failed_signal);
2463                 if (err < 0) {
2464                         pr_err("Couldn't run the workload!\n");
2465                         status = err;
2466                         goto out_delete_session;
2467                 }
2468         }
2469
2470         /*
2471          * If we have just a single event and are sending data
2472          * through a pipe, we need to force ID allocation,
2473          * because we synthesize the event name through the pipe
2474          * and need the ID for that.
2475          */
2476         if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2477                 rec->opts.sample_id = true;
2478
2479         if (rec->timestamp_filename && perf_data__is_pipe(data)) {
2480                 rec->timestamp_filename = false;
2481                 pr_warning("WARNING: --timestamp-filename option is not available in pipe mode.\n");
2482         }
2483
2484         evlist__uniquify_name(rec->evlist);
2485
2486         /* Debug message used by test scripts */
2487         pr_debug3("perf record opening and mmapping events\n");
2488         if (record__open(rec) != 0) {
2489                 err = -1;
2490                 goto out_free_threads;
2491         }
2492         /* Debug message used by test scripts */
2493         pr_debug3("perf record done opening and mmapping events\n");
2494         session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
2495
2496         if (rec->opts.kcore) {
2497                 err = record__kcore_copy(&session->machines.host, data);
2498                 if (err) {
2499                         pr_err("ERROR: Failed to copy kcore\n");
2500                         goto out_free_threads;
2501                 }
2502         }
2503
2504         /*
2505          * Normally perf_session__new would do this, but it doesn't have the
2506          * evlist.
2507          */
2508         if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2509                 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2510                 rec->tool.ordered_events = false;
2511         }
2512
2513         if (evlist__nr_groups(rec->evlist) == 0)
2514                 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2515
2516         if (data->is_pipe) {
2517                 err = perf_header__write_pipe(fd);
2518                 if (err < 0)
2519                         goto out_free_threads;
2520         } else {
2521                 err = perf_session__write_header(session, rec->evlist, fd, false);
2522                 if (err < 0)
2523                         goto out_free_threads;
2524         }
2525
2526         err = -1;
2527         if (!rec->no_buildid
2528             && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2529                 pr_err("Couldn't generate buildids. "
2530                        "Use --no-buildid to profile anyway.\n");
2531                 goto out_free_threads;
2532         }
2533
2534         err = record__setup_sb_evlist(rec);
2535         if (err)
2536                 goto out_free_threads;
2537
2538         err = record__synthesize(rec, false);
2539         if (err < 0)
2540                 goto out_free_threads;
2541
2542         if (rec->realtime_prio) {
2543                 struct sched_param param;
2544
2545                 param.sched_priority = rec->realtime_prio;
2546                 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2547                         pr_err("Could not set realtime priority.\n");
2548                         err = -1;
2549                         goto out_free_threads;
2550                 }
2551         }
2552
2553         if (record__start_threads(rec))
2554                 goto out_free_threads;
2555
2556         /*
2557          * When perf is starting the traced process, all the events
2558          * (apart from group members) have enable_on_exec=1 set,
2559          * so don't spoil it by prematurely enabling them.
2560          */
2561         if (!target__none(&opts->target) && !opts->target.initial_delay)
2562                 evlist__enable(rec->evlist);
2563
2564         /*
2565          * Let the child rip
2566          */
2567         if (forks) {
2568                 struct machine *machine = &session->machines.host;
2569                 union perf_event *event;
2570                 pid_t tgid;
2571
2572                 event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2573                 if (event == NULL) {
2574                         err = -ENOMEM;
2575                         goto out_child;
2576                 }
2577
2578                 /*
2579                  * Some H/W events are generated before the COMM event,
2580                  * which is emitted during exec(), so perf script
2581                  * cannot see the correct process name for those events.
2582                  * Synthesize a COMM event to prevent that.
2583                  */
2584                 tgid = perf_event__synthesize_comm(tool, event,
2585                                                    rec->evlist->workload.pid,
2586                                                    process_synthesized_event,
2587                                                    machine);
2588                 free(event);
2589
2590                 if (tgid == -1)
2591                         goto out_child;
2592
2593                 event = malloc(sizeof(event->namespaces) +
2594                                (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2595                                machine->id_hdr_size);
2596                 if (event == NULL) {
2597                         err = -ENOMEM;
2598                         goto out_child;
2599                 }
2600
2601                 /*
2602                  * Synthesize NAMESPACES event for the command specified.
2603                  */
2604                 perf_event__synthesize_namespaces(tool, event,
2605                                                   rec->evlist->workload.pid,
2606                                                   tgid, process_synthesized_event,
2607                                                   machine);
2608                 free(event);
2609
2610                 evlist__start_workload(rec->evlist);
2611         }
2612
2613         if (opts->target.initial_delay) {
2614                 pr_info(EVLIST_DISABLED_MSG);
2615                 if (opts->target.initial_delay > 0) {
2616                         usleep(opts->target.initial_delay * USEC_PER_MSEC);
2617                         evlist__enable(rec->evlist);
2618                         pr_info(EVLIST_ENABLED_MSG);
2619                 }
2620         }
2621
2622         err = event_enable_timer__start(rec->evlist->eet);
2623         if (err)
2624                 goto out_child;
2625
2626         /* Debug message used by test scripts */
2627         pr_debug3("perf record has started\n");
2628         fflush(stderr);
2629
2630         trigger_ready(&auxtrace_snapshot_trigger);
2631         trigger_ready(&switch_output_trigger);
2632         perf_hooks__invoke_record_start();
2633
2634         /*
2635          * Must write FINISHED_INIT so it will be seen after all other
2636          * synthesized user events, but before any regular events.
2637          */
2638         err = write_finished_init(rec, false);
2639         if (err < 0)
2640                 goto out_child;
2641
2642         for (;;) {
2643                 unsigned long long hits = thread->samples;
2644
2645                 /*
2646                  * rec->evlist->bkw_mmap_state can be BKW_MMAP_EMPTY
2647                  * here: when done == true and hits != rec->samples
2648                  * in the previous round.
2649                  *
2650                  * evlist__toggle_bkw_mmap() ensures we never
2651                  * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2652                  */
2653                 if (trigger_is_hit(&switch_output_trigger) || done || draining)
2654                         evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2655
2656                 if (record__mmap_read_all(rec, false) < 0) {
2657                         trigger_error(&auxtrace_snapshot_trigger);
2658                         trigger_error(&switch_output_trigger);
2659                         err = -1;
2660                         goto out_child;
2661                 }
2662
2663                 if (auxtrace_record__snapshot_started) {
2664                         auxtrace_record__snapshot_started = 0;
2665                         if (!trigger_is_error(&auxtrace_snapshot_trigger))
2666                                 record__read_auxtrace_snapshot(rec, false);
2667                         if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2668                                 pr_err("AUX area tracing snapshot failed\n");
2669                                 err = -1;
2670                                 goto out_child;
2671                         }
2672                 }
2673
2674                 if (trigger_is_hit(&switch_output_trigger)) {
2675                         /*
2676                          * If switch_output_trigger is hit, the data in the
2677                          * overwritable ring buffer should have been collected,
2678                          * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2679                          *
2680                          * If SIGUSR2 was raised after or during record__mmap_read_all(),
2681                          * record__mmap_read_all() didn't collect data from the
2682                          * overwritable ring buffer. Read again.
2683                          */
2684                         if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2685                                 continue;
2686                         trigger_ready(&switch_output_trigger);
2687
2688                         /*
2689                          * Re-enable events in the overwrite ring buffer after
2690                          * record__mmap_read_all(): we should have collected
2691                          * data from it.
2692                          */
2693                         evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2694
2695                         if (!quiet)
2696                                 fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2697                                         record__waking(rec));
2698                         thread->waking = 0;
2699                         fd = record__switch_output(rec, false);
2700                         if (fd < 0) {
2701                                 pr_err("Failed to switch to new file\n");
2702                                 trigger_error(&switch_output_trigger);
2703                                 err = fd;
2704                                 goto out_child;
2705                         }
2706
2707                         /* re-arm the alarm */
2708                         if (rec->switch_output.time)
2709                                 alarm(rec->switch_output.time);
2710                 }
2711
2712                 if (hits == thread->samples) {
2713                         if (done || draining)
2714                                 break;
2715                         err = fdarray__poll(&thread->pollfd, -1);
2716                         /*
2717                          * Propagate the error only if there is one. Ignore a positive
2718                          * number of returned events and an interrupted poll (EINTR).
2719                          */
2720                         if (err > 0 || (err < 0 && errno == EINTR))
2721                                 err = 0;
2722                         thread->waking++;
2723
2724                         if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2725                                             record__thread_munmap_filtered, NULL) == 0)
2726                                 draining = true;
2727
2728                         err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2729                         if (err)
2730                                 goto out_child;
2731                 }
2732
2733                 if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2734                         switch (cmd) {
2735                         case EVLIST_CTL_CMD_SNAPSHOT:
2736                                 hit_auxtrace_snapshot_trigger(rec);
2737                                 evlist__ctlfd_ack(rec->evlist);
2738                                 break;
2739                         case EVLIST_CTL_CMD_STOP:
2740                                 done = 1;
2741                                 break;
2742                         case EVLIST_CTL_CMD_ACK:
2743                         case EVLIST_CTL_CMD_UNSUPPORTED:
2744                         case EVLIST_CTL_CMD_ENABLE:
2745                         case EVLIST_CTL_CMD_DISABLE:
2746                         case EVLIST_CTL_CMD_EVLIST:
2747                         case EVLIST_CTL_CMD_PING:
2748                         default:
2749                                 break;
2750                         }
2751                 }
2752
2753                 err = event_enable_timer__process(rec->evlist->eet);
2754                 if (err < 0)
2755                         goto out_child;
2756                 if (err) {
2757                         err = 0;
2758                         done = 1;
2759                 }
2760
2761                 /*
2762                  * When perf is starting the traced process, the events die
2763                  * with the process at the end and we wait for that. Thus there
2764                  * is no need to disable the events in this case.
2765                  */
2766                 if (done && !disabled && !target__none(&opts->target)) {
2767                         trigger_off(&auxtrace_snapshot_trigger);
2768                         evlist__disable(rec->evlist);
2769                         disabled = true;
2770                 }
2771         }
2772
2773         trigger_off(&auxtrace_snapshot_trigger);
2774         trigger_off(&switch_output_trigger);
2775
2776         if (opts->auxtrace_snapshot_on_exit)
2777                 record__auxtrace_snapshot_exit(rec);
2778
2779         if (forks && workload_exec_errno) {
2780                 char msg[STRERR_BUFSIZE], strevsels[2048];
2781                 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2782
2783                 evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2784
2785                 pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2786                         strevsels, argv[0], emsg);
2787                 err = -1;
2788                 goto out_child;
2789         }
2790
2791         if (!quiet)
2792                 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2793                         record__waking(rec));
2794
2795         write_finished_init(rec, true);
2796
2797         if (target__none(&rec->opts.target))
2798                 record__synthesize_workload(rec, true);
2799
2800 out_child:
2801         record__stop_threads(rec);
2802         record__mmap_read_all(rec, true);
2803 out_free_threads:
2804         record__free_thread_data(rec);
2805         evlist__finalize_ctlfd(rec->evlist);
2806         record__aio_mmap_read_sync(rec);
2807
2808         if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2809                 ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2810                 session->header.env.comp_ratio = ratio + 0.5;
2811         }
2812
2813         if (forks) {
2814                 int exit_status;
2815
2816                 if (!child_finished)
2817                         kill(rec->evlist->workload.pid, SIGTERM);
2818
2819                 wait(&exit_status);
2820
2821                 if (err < 0)
2822                         status = err;
2823                 else if (WIFEXITED(exit_status))
2824                         status = WEXITSTATUS(exit_status);
2825                 else if (WIFSIGNALED(exit_status))
2826                         signr = WTERMSIG(exit_status);
2827         } else
2828                 status = err;
2829
2830         if (rec->off_cpu)
2831                 rec->bytes_written += off_cpu_write(rec->session);
2832
2833         record__read_lost_samples(rec);
2834         record__synthesize(rec, true);
2835         /* this will be recalculated during process_buildids() */
2836         rec->samples = 0;
2837
2838         if (!err) {
2839                 if (!rec->timestamp_filename) {
2840                         record__finish_output(rec);
2841                 } else {
2842                         fd = record__switch_output(rec, true);
2843                         if (fd < 0) {
2844                                 status = fd;
2845                                 goto out_delete_session;
2846                         }
2847                 }
2848         }
2849
2850         perf_hooks__invoke_record_end();
2851
2852         if (!err && !quiet) {
2853                 char samples[128];
2854                 const char *postfix = rec->timestamp_filename ?
2855                                         ".<timestamp>" : "";
2856
2857                 if (rec->samples && !rec->opts.full_auxtrace)
2858                         scnprintf(samples, sizeof(samples),
2859                                   " (%" PRIu64 " samples)", rec->samples);
2860                 else
2861                         samples[0] = '\0';
2862
2863                 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
2864                         perf_data__size(data) / 1024.0 / 1024.0,
2865                         data->path, postfix, samples);
2866                 if (ratio) {
2867                         fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
2868                                         rec->session->bytes_transferred / 1024.0 / 1024.0,
2869                                         ratio);
2870                 }
2871                 fprintf(stderr, " ]\n");
2872         }
2873
2874 out_delete_session:
2875 #ifdef HAVE_EVENTFD_SUPPORT
2876         if (done_fd >= 0) {
2877                 fd = done_fd;
2878                 done_fd = -1;
2879
2880                 close(fd);
2881         }
2882 #endif
2883         zstd_fini(&session->zstd_data);
2884         perf_session__delete(session);
2885
2886         if (!opts->no_bpf_event)
2887                 evlist__stop_sb_thread(rec->sb_evlist);
2888         return status;
2889 }
2890
2891 static void callchain_debug(struct callchain_param *callchain)
2892 {
2893         static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2894
2895         pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2896
2897         if (callchain->record_mode == CALLCHAIN_DWARF)
2898                 pr_debug("callchain: stack dump size %d\n",
2899                          callchain->dump_size);
2900 }
2901
2902 int record_opts__parse_callchain(struct record_opts *record,
2903                                  struct callchain_param *callchain,
2904                                  const char *arg, bool unset)
2905 {
2906         int ret;
2907         callchain->enabled = !unset;
2908
2909         /* --no-call-graph */
2910         if (unset) {
2911                 callchain->record_mode = CALLCHAIN_NONE;
2912                 pr_debug("callchain: disabled\n");
2913                 return 0;
2914         }
2915
2916         ret = parse_callchain_record_opt(arg, callchain);
2917         if (!ret) {
2918                 /* Enable data address sampling for DWARF unwind. */
2919                 if (callchain->record_mode == CALLCHAIN_DWARF)
2920                         record->sample_address = true;
2921                 callchain_debug(callchain);
2922         }
2923
2924         return ret;
2925 }
2926
2927 int record_parse_callchain_opt(const struct option *opt,
2928                                const char *arg,
2929                                int unset)
2930 {
2931         return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2932 }
2933
2934 int record_callchain_opt(const struct option *opt,
2935                          const char *arg __maybe_unused,
2936                          int unset __maybe_unused)
2937 {
2938         struct callchain_param *callchain = opt->value;
2939
2940         callchain->enabled = true;
2941
2942         if (callchain->record_mode == CALLCHAIN_NONE)
2943                 callchain->record_mode = CALLCHAIN_FP;
2944
2945         callchain_debug(callchain);
2946         return 0;
2947 }
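/*
 * Illustrative command lines that reach the two callbacks above (a sketch;
 * exact defaults depend on the build and the kernel):
 *
 *   perf record -g ./workload                   # record_callchain_opt(): enable
 *                                               # callchains, default to FP mode
 *   perf record --call-graph dwarf,8192 ./wl    # record_parse_callchain_opt():
 *                                               # DWARF unwind with an 8192 byte
 *                                               # stack dump, which also enables
 *                                               # sample_address above
 *   perf record --no-call-graph ./workload      # unset path: CALLCHAIN_NONE
 */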
2948
2949 static int perf_record_config(const char *var, const char *value, void *cb)
2950 {
2951         struct record *rec = cb;
2952
2953         if (!strcmp(var, "record.build-id")) {
2954                 if (!strcmp(value, "cache"))
2955                         rec->no_buildid_cache = false;
2956                 else if (!strcmp(value, "no-cache"))
2957                         rec->no_buildid_cache = true;
2958                 else if (!strcmp(value, "skip"))
2959                         rec->no_buildid = true;
2960                 else if (!strcmp(value, "mmap"))
2961                         rec->buildid_mmap = true;
2962                 else
2963                         return -1;
2964                 return 0;
2965         }
2966         if (!strcmp(var, "record.call-graph")) {
2967                 var = "call-graph.record-mode";
2968                 return perf_default_config(var, value, cb);
2969         }
2970 #ifdef HAVE_AIO_SUPPORT
2971         if (!strcmp(var, "record.aio")) {
2972                 rec->opts.nr_cblocks = strtol(value, NULL, 0);
2973                 if (!rec->opts.nr_cblocks)
2974                         rec->opts.nr_cblocks = nr_cblocks_default;
2975         }
2976 #endif
2977         if (!strcmp(var, "record.debuginfod")) {
2978                 rec->debuginfod.urls = strdup(value);
2979                 if (!rec->debuginfod.urls)
2980                         return -ENOMEM;
2981                 rec->debuginfod.set = true;
2982         }
2983
2984         return 0;
2985 }
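/*
 * The variables handled above come from the perf config file; a purely
 * illustrative ~/.perfconfig fragment (values are examples, not defaults):
 *
 *   [record]
 *           build-id = cache        # or: no-cache, skip, mmap
 *           call-graph = dwarf      # forwarded as call-graph.record-mode
 *           aio = 2                 # only honoured with HAVE_AIO_SUPPORT
 *           debuginfod = https://debuginfod.example.org    # hypothetical URL
 */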
2986
2987 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
2988 {
2989         struct record *rec = (struct record *)opt->value;
2990
2991         return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
2992 }
2993
2994 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2995 {
2996         struct record_opts *opts = (struct record_opts *)opt->value;
2997
2998         if (unset || !str)
2999                 return 0;
3000
3001         if (!strcasecmp(str, "node"))
3002                 opts->affinity = PERF_AFFINITY_NODE;
3003         else if (!strcasecmp(str, "cpu"))
3004                 opts->affinity = PERF_AFFINITY_CPU;
3005
3006         return 0;
3007 }
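/*
 * For example (illustrative):
 *
 *   perf record --affinity=node -a -- sleep 10   # bind the reading thread to the
 *                                                # NUMA node of the drained mmap
 *   perf record --affinity=cpu  -a -- sleep 10   # bind it to the mmapped CPU
 *
 * Any other value falls through and keeps the PERF_AFFINITY_SYS default set
 * in cmd_record().
 */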
3008
3009 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
3010 {
3011         mask->nbits = nr_bits;
3012         mask->bits = bitmap_zalloc(mask->nbits);
3013         if (!mask->bits)
3014                 return -ENOMEM;
3015
3016         return 0;
3017 }
3018
3019 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
3020 {
3021         bitmap_free(mask->bits);
3022         mask->nbits = 0;
3023 }
3024
3025 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
3026 {
3027         int ret;
3028
3029         ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
3030         if (ret) {
3031                 mask->affinity.bits = NULL;
3032                 return ret;
3033         }
3034
3035         ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
3036         if (ret) {
3037                 record__mmap_cpu_mask_free(&mask->maps);
3038                 mask->maps.bits = NULL;
3039         }
3040
3041         return ret;
3042 }
3043
3044 static void record__thread_mask_free(struct thread_mask *mask)
3045 {
3046         record__mmap_cpu_mask_free(&mask->maps);
3047         record__mmap_cpu_mask_free(&mask->affinity);
3048 }
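/*
 * A minimal sketch of the expected alloc/free pairing (mirrors the callers
 * further down in this file):
 *
 *	struct thread_mask tm;
 *
 *	if (!record__thread_mask_alloc(&tm, cpu__max_cpu().cpu)) {
 *		__set_bit(0, tm.maps.bits);
 *		__set_bit(0, tm.affinity.bits);
 *		...
 *		record__thread_mask_free(&tm);
 *	}
 */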
3049
3050 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3051 {
3052         int s;
3053         struct record_opts *opts = opt->value;
3054
3055         if (unset || !str || !strlen(str)) {
3056                 opts->threads_spec = THREAD_SPEC__CPU;
3057         } else {
3058                 for (s = 1; s < THREAD_SPEC__MAX; s++) {
3059                         if (s == THREAD_SPEC__USER) {
3060                                 opts->threads_user_spec = strdup(str);
3061                                 if (!opts->threads_user_spec)
3062                                         return -ENOMEM;
3063                                 opts->threads_spec = THREAD_SPEC__USER;
3064                                 break;
3065                         }
3066                         if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3067                                 opts->threads_spec = s;
3068                                 break;
3069                         }
3070                 }
3071         }
3072
3073         if (opts->threads_spec == THREAD_SPEC__USER)
3074                 pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3075         else
3076                 pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3077
3078         return 0;
3079 }
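/*
 * Illustrative --threads specifications accepted here:
 *
 *   perf record --threads ...                    # empty spec, defaults to 'cpu'
 *   perf record --threads=core ...               # also: cpu, package, numa
 *   perf record --threads=0-3/0-3:4-7/4-7 ...    # user spec, <maps>/<affinity>
 *                                                # pairs, consumed later by
 *                                                # record__init_thread_user_masks()
 */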
3080
3081 static int parse_output_max_size(const struct option *opt,
3082                                  const char *str, int unset)
3083 {
3084         unsigned long *s = (unsigned long *)opt->value;
3085         static struct parse_tag tags_size[] = {
3086                 { .tag  = 'B', .mult = 1       },
3087                 { .tag  = 'K', .mult = 1 << 10 },
3088                 { .tag  = 'M', .mult = 1 << 20 },
3089                 { .tag  = 'G', .mult = 1 << 30 },
3090                 { .tag  = 0 },
3091         };
3092         unsigned long val;
3093
3094         if (unset) {
3095                 *s = 0;
3096                 return 0;
3097         }
3098
3099         val = parse_tag_value(str, tags_size);
3100         if (val != (unsigned long) -1) {
3101                 *s = val;
3102                 return 0;
3103         }
3104
3105         return -1;
3106 }
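/*
 * For example, "--max-size=10M" is parsed via tags_size above as
 * 10 * (1 << 20) = 10485760 bytes, while a bare "--max-size=200" without a
 * B/K/M/G suffix makes parse_tag_value() return (unsigned long)-1 and the
 * option is rejected.
 */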
3107
3108 static int record__parse_mmap_pages(const struct option *opt,
3109                                     const char *str,
3110                                     int unset __maybe_unused)
3111 {
3112         struct record_opts *opts = opt->value;
3113         char *s, *p;
3114         unsigned int mmap_pages;
3115         int ret;
3116
3117         if (!str)
3118                 return -EINVAL;
3119
3120         s = strdup(str);
3121         if (!s)
3122                 return -ENOMEM;
3123
3124         p = strchr(s, ',');
3125         if (p)
3126                 *p = '\0';
3127
3128         if (*s) {
3129                 ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3130                 if (ret)
3131                         goto out_free;
3132                 opts->mmap_pages = mmap_pages;
3133         }
3134
3135         if (!p) {
3136                 ret = 0;
3137                 goto out_free;
3138         }
3139
3140         ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3141         if (ret)
3142                 goto out_free;
3143
3144         opts->auxtrace_mmap_pages = mmap_pages;
3145
3146 out_free:
3147         free(s);
3148         return ret;
3149 }
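/*
 * For example (illustrative):
 *
 *   perf record -m 512 ...       # 512 data pages, AUX mmap size untouched
 *   perf record -m 512,128 ...   # 512 data pages plus 128 AUX area tracing pages
 *   perf record -m ,64 ...       # only set the AUX area tracing mmap size
 *
 * Byte sizes such as "16M" are also accepted by __evlist__parse_mmap_pages()
 * and converted to a page count.
 */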
3150
3151 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3152 {
3153 }
3154
3155 static int parse_control_option(const struct option *opt,
3156                                 const char *str,
3157                                 int unset __maybe_unused)
3158 {
3159         struct record_opts *opts = opt->value;
3160
3161         return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3162 }
3163
3164 static void switch_output_size_warn(struct record *rec)
3165 {
3166         u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3167         struct switch_output *s = &rec->switch_output;
3168
3169         wakeup_size /= 2;
3170
3171         if (s->size < wakeup_size) {
3172                 char buf[100];
3173
3174                 unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3175                 pr_warning("WARNING: switch-output data size is lower than the "
3176                            "wakeup kernel buffer size (%s), "
3177                            "expect bigger perf.data sizes\n", buf);
3178         }
3179 }
3180
3181 static int switch_output_setup(struct record *rec)
3182 {
3183         struct switch_output *s = &rec->switch_output;
3184         static struct parse_tag tags_size[] = {
3185                 { .tag  = 'B', .mult = 1       },
3186                 { .tag  = 'K', .mult = 1 << 10 },
3187                 { .tag  = 'M', .mult = 1 << 20 },
3188                 { .tag  = 'G', .mult = 1 << 30 },
3189                 { .tag  = 0 },
3190         };
3191         static struct parse_tag tags_time[] = {
3192                 { .tag  = 's', .mult = 1        },
3193                 { .tag  = 'm', .mult = 60       },
3194                 { .tag  = 'h', .mult = 60*60    },
3195                 { .tag  = 'd', .mult = 60*60*24 },
3196                 { .tag  = 0 },
3197         };
3198         unsigned long val;
3199
3200         /*
3201          * If we're using --switch-output-event, then it implies
3202          * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3203          * thread to its parent.
3204          */
3205         if (rec->switch_output_event_set) {
3206                 if (record__threads_enabled(rec)) {
3207                         pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3208                         return 0;
3209                 }
3210                 goto do_signal;
3211         }
3212
3213         if (!s->set)
3214                 return 0;
3215
3216         if (record__threads_enabled(rec)) {
3217                 pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3218                 return 0;
3219         }
3220
3221         if (!strcmp(s->str, "signal")) {
3222 do_signal:
3223                 s->signal = true;
3224                 pr_debug("switch-output with SIGUSR2 signal\n");
3225                 goto enabled;
3226         }
3227
3228         val = parse_tag_value(s->str, tags_size);
3229         if (val != (unsigned long) -1) {
3230                 s->size = val;
3231                 pr_debug("switch-output with %s size threshold\n", s->str);
3232                 goto enabled;
3233         }
3234
3235         val = parse_tag_value(s->str, tags_time);
3236         if (val != (unsigned long) -1) {
3237                 s->time = val;
3238                 pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3239                          s->str, s->time);
3240                 goto enabled;
3241         }
3242
3243         return -1;
3244
3245 enabled:
3246         rec->timestamp_filename = true;
3247         s->enabled              = true;
3248
3249         if (s->size && !rec->opts.no_buffering)
3250                 switch_output_size_warn(rec);
3251
3252         return 0;
3253 }
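/*
 * Illustrative --switch-output values accepted above (each of them also
 * implies --timestamp-filename):
 *
 *   perf record --switch-output ...       # same as --switch-output=signal (SIGUSR2)
 *   perf record --switch-output=2G ...    # rotate the output after ~2GB written
 *   perf record --switch-output=30s ...   # rotate the output every 30 seconds
 */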
3254
3255 static const char * const __record_usage[] = {
3256         "perf record [<options>] [<command>]",
3257         "perf record [<options>] -- <command> [<options>]",
3258         NULL
3259 };
3260 const char * const *record_usage = __record_usage;
3261
3262 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
3263                                   struct perf_sample *sample, struct machine *machine)
3264 {
3265         /*
3266          * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3267          * so there is no need to add them twice.
3268          */
3269         if (!(event->header.misc & PERF_RECORD_MISC_USER))
3270                 return 0;
3271         return perf_event__process_mmap(tool, event, sample, machine);
3272 }
3273
3274 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
3275                                    struct perf_sample *sample, struct machine *machine)
3276 {
3277         /*
3278          * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3279          * so there is no need to add them twice.
3280          */
3281         if (!(event->header.misc & PERF_RECORD_MISC_USER))
3282                 return 0;
3283
3284         return perf_event__process_mmap2(tool, event, sample, machine);
3285 }
3286
3287 static int process_timestamp_boundary(struct perf_tool *tool,
3288                                       union perf_event *event __maybe_unused,
3289                                       struct perf_sample *sample,
3290                                       struct machine *machine __maybe_unused)
3291 {
3292         struct record *rec = container_of(tool, struct record, tool);
3293
3294         set_timestamp_boundary(rec, sample->time);
3295         return 0;
3296 }
3297
3298 static int parse_record_synth_option(const struct option *opt,
3299                                      const char *str,
3300                                      int unset __maybe_unused)
3301 {
3302         struct record_opts *opts = opt->value;
3303         char *p = strdup(str);
3304
3305         if (p == NULL)
3306                 return -1;
3307
3308         opts->synth = parse_synth_opt(p);
3309         free(p);
3310
3311         if (opts->synth < 0) {
3312                 pr_err("Invalid synth option: %s\n", str);
3313                 return -1;
3314         }
3315         return 0;
3316 }
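/*
 * For example, "--synth=no" skips the potentially expensive synthesis of
 * pre-existing task/mmap/cgroup events at startup, while the default
 * "--synth=all" keeps it; see parse_synth_opt() for the accepted tokens.
 */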
3317
3318 /*
3319  * XXX Ideally this would be local to cmd_record() and passed to a record__new(),
3320  * because we need access to it in record__exit(), which is called after
3321  * cmd_record() exits; but since record_options needs to be accessible to
3322  * builtin-script, leave it here.
3323  *
3324  * At least we don't touch it in all the other functions here directly.
3325  *
3326  * Just say no to tons of global variables, sigh.
3327  */
3328 static struct record record = {
3329         .opts = {
3330                 .sample_time         = true,
3331                 .mmap_pages          = UINT_MAX,
3332                 .user_freq           = UINT_MAX,
3333                 .user_interval       = ULLONG_MAX,
3334                 .freq                = 4000,
3335                 .target              = {
3336                         .uses_mmap   = true,
3337                         .default_per_cpu = true,
3338                 },
3339                 .mmap_flush          = MMAP_FLUSH_DEFAULT,
3340                 .nr_threads_synthesize = 1,
3341                 .ctl_fd              = -1,
3342                 .ctl_fd_ack          = -1,
3343                 .synth               = PERF_SYNTH_ALL,
3344         },
3345         .tool = {
3346                 .sample         = process_sample_event,
3347                 .fork           = perf_event__process_fork,
3348                 .exit           = perf_event__process_exit,
3349                 .comm           = perf_event__process_comm,
3350                 .namespaces     = perf_event__process_namespaces,
3351                 .mmap           = build_id__process_mmap,
3352                 .mmap2          = build_id__process_mmap2,
3353                 .itrace_start   = process_timestamp_boundary,
3354                 .aux            = process_timestamp_boundary,
3355                 .ordered_events = true,
3356         },
3357 };
3358
3359 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3360         "\n\t\t\t\tDefault: fp";
3361
3362 static bool dry_run;
3363
3364 static struct parse_events_option_args parse_events_option_args = {
3365         .evlistp = &record.evlist,
3366 };
3367
3368 static struct parse_events_option_args switch_output_parse_events_option_args = {
3369         .evlistp = &record.sb_evlist,
3370 };
3371
3372 /*
3373  * XXX Will stay a global variable until we fix builtin-script.c to stop messing
3374  * with it and switch to using the library functions in perf_evlist that came
3375  * from builtin-record.c, i.e. use record_opts,
3376  * evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record',
3377  * using pipes, etc.
3378  */
3379 static struct option __record_options[] = {
3380         OPT_CALLBACK('e', "event", &parse_events_option_args, "event",
3381                      "event selector. use 'perf list' to list available events",
3382                      parse_events_option),
3383         OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3384                      "event filter", parse_filter),
3385         OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3386                            NULL, "don't record events from perf itself",
3387                            exclude_perf),
3388         OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3389                     "record events on existing process id"),
3390         OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3391                     "record events on existing thread id"),
3392         OPT_INTEGER('r', "realtime", &record.realtime_prio,
3393                     "collect data with this RT SCHED_FIFO priority"),
3394         OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3395                     "collect data without buffering"),
3396         OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3397                     "collect raw sample records from all opened counters"),
3398         OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3399                             "system-wide collection from all CPUs"),
3400         OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3401                     "list of cpus to monitor"),
3402         OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3403         OPT_STRING('o', "output", &record.data.path, "file",
3404                     "output file name"),
3405         OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3406                         &record.opts.no_inherit_set,
3407                         "child tasks do not inherit counters"),
3408         OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3409                     "synthesize non-sample events at the end of output"),
3410         OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3411         OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3412         OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3413                     "Fail if the specified frequency can't be used"),
3414         OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3415                      "profile at this frequency",
3416                       record__parse_freq),
3417         OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3418                      "number of mmap data pages and AUX area tracing mmap pages",
3419                      record__parse_mmap_pages),
3420         OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3421                      "Minimal number of bytes that are extracted from mmap data pages (default: 1)",
3422                      record__mmap_flush_parse),
3423         OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3424                            NULL, "enables call-graph recording" ,
3425                            &record_callchain_opt),
3426         OPT_CALLBACK(0, "call-graph", &record.opts,
3427                      "record_mode[,record_size]", record_callchain_help,
3428                      &record_parse_callchain_opt),
3429         OPT_INCR('v', "verbose", &verbose,
3430                     "be more verbose (show counter open errors, etc)"),
3431         OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3432         OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3433                     "per thread counts"),
3434         OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3435         OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3436                     "Record the sample physical addresses"),
3437         OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3438                     "Record the sampled data address data page size"),
3439         OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3440                     "Record the sampled code address (ip) page size"),
3441         OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3442         OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3443                     "Record the sample identifier"),
3444         OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3445                         &record.opts.sample_time_set,
3446                         "Record the sample timestamps"),
3447         OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3448                         "Record the sample period"),
3449         OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3450                     "don't sample"),
3451         OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3452                         &record.no_buildid_cache_set,
3453                         "do not update the buildid cache"),
3454         OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3455                         &record.no_buildid_set,
3456                         "do not collect buildids in perf.data"),
3457         OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3458                      "monitor event in cgroup name only",
3459                      parse_cgroups),
3460         OPT_CALLBACK('D', "delay", &record, "ms",
3461                      "ms to wait before starting measurement after program start (-1: start with events disabled), "
3462                      "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3463                      record__parse_event_enable_time),
3464         OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3465         OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
3466                    "user to profile"),
3467
3468         OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3469                      "branch any", "sample any taken branches",
3470                      parse_branch_stack),
3471
3472         OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3473                      "branch filter mask", "branch stack filter modes",
3474                      parse_branch_stack),
3475         OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3476                     "sample by weight (on special events only)"),
3477         OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3478                     "sample transaction flags (special events only)"),
3479         OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3480                     "use per-thread mmaps"),
3481         OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3482                     "sample selected machine registers on interrupt,"
3483                     " use '-I?' to list register names", parse_intr_regs),
3484         OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3485                     "sample selected machine registers on interrupt,"
3486                     " use '--user-regs=?' to list register names", parse_user_regs),
3487         OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3488                     "Record running/enabled time of read (:S) events"),
3489         OPT_CALLBACK('k', "clockid", &record.opts,
3490         "clockid", "clockid to use for events, see clock_gettime()",
3491         parse_clockid),
3492         OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3493                           "opts", "AUX area tracing Snapshot Mode", ""),
3494         OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3495                           "opts", "sample AUX area", ""),
3496         OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3497                         "per thread proc mmap processing timeout in ms"),
3498         OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3499                     "Record namespaces events"),
3500         OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3501                     "Record cgroup events"),
3502         OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3503                         &record.opts.record_switch_events_set,
3504                         "Record context switch events"),
3505         OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3506                          "Configure all used events to run in kernel space.",
3507                          PARSE_OPT_EXCLUSIVE),
3508         OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3509                          "Configure all used events to run in user space.",
3510                          PARSE_OPT_EXCLUSIVE),
3511         OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3512                     "collect kernel callchains"),
3513         OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3514                     "collect user callchains"),
3515         OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3516                    "file", "vmlinux pathname"),
3517         OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3518                     "Record build-id of all DSOs regardless of hits"),
3519         OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3520                     "Record build-id in map events"),
3521         OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3522                     "append timestamp to output filename"),
3523         OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3524                     "Record timestamp boundary (time of first/last samples)"),
3525         OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3526                           &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3527                           "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3528                           "signal"),
3529         OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args,
3530                          &record.switch_output_event_set, "switch output event",
3531                          "switch output event selector. use 'perf list' to list available events",
3532                          parse_events_option_new_evlist),
3533         OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3534                    "Limit number of switch output generated files"),
3535         OPT_BOOLEAN(0, "dry-run", &dry_run,
3536                     "Parse options then exit"),
3537 #ifdef HAVE_AIO_SUPPORT
3538         OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3539                      &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3540                      record__aio_parse),
3541 #endif
3542         OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3543                      "Set the affinity mask of the trace reading thread to the NUMA node cpu mask or to the cpu of the processed mmap buffer",
3544                      record__parse_affinity),
3545 #ifdef HAVE_ZSTD_SUPPORT
3546         OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3547                             "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3548                             record__parse_comp_level),
3549 #endif
3550         OPT_CALLBACK(0, "max-size", &record.output_max_size,
3551                      "size", "Limit the maximum size of the output file", parse_output_max_size),
3552         OPT_UINTEGER(0, "num-thread-synthesize",
3553                      &record.opts.nr_threads_synthesize,
3554                      "number of threads to run for event synthesis"),
3555 #ifdef HAVE_LIBPFM
3556         OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3557                 "libpfm4 event selector. use 'perf list' to list available events",
3558                 parse_libpfm_events_option),
3559 #endif
3560         OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3561                      "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3562                      "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
3563                      "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3564                      "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3565                       parse_control_option),
3566         OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3567                      "Fine-tune event synthesis: default=all", parse_record_synth_option),
3568         OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3569                           &record.debuginfod.set, "debuginfod urls",
3570                           "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3571                           "system"),
3572         OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3573                             "write collected trace data into several data files using parallel threads",
3574                             record__parse_threads),
3575         OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3576         OPT_END()
3577 };
3578
3579 struct option *record_options = __record_options;
3580
3581 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3582 {
3583         struct perf_cpu cpu;
3584         int idx;
3585
3586         if (cpu_map__is_dummy(cpus))
3587                 return 0;
3588
3589         perf_cpu_map__for_each_cpu_skip_any(cpu, idx, cpus) {
3590                 /* Return ENODEV if the input cpu is greater than max cpu */
3591                 if ((unsigned long)cpu.cpu > mask->nbits)
3592                         return -ENODEV;
3593                 __set_bit(cpu.cpu, mask->bits);
3594         }
3595
3596         return 0;
3597 }
3598
3599 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3600 {
3601         struct perf_cpu_map *cpus;
3602
3603         cpus = perf_cpu_map__new(mask_spec);
3604         if (!cpus)
3605                 return -ENOMEM;
3606
3607         bitmap_zero(mask->bits, mask->nbits);
3608         if (record__mmap_cpu_mask_init(mask, cpus))
3609                 return -ENODEV;
3610
3611         perf_cpu_map__put(cpus);
3612
3613         return 0;
3614 }
3615
3616 static void record__free_thread_masks(struct record *rec, int nr_threads)
3617 {
3618         int t;
3619
3620         if (rec->thread_masks)
3621                 for (t = 0; t < nr_threads; t++)
3622                         record__thread_mask_free(&rec->thread_masks[t]);
3623
3624         zfree(&rec->thread_masks);
3625 }
3626
3627 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3628 {
3629         int t, ret;
3630
3631         rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3632         if (!rec->thread_masks) {
3633                 pr_err("Failed to allocate thread masks\n");
3634                 return -ENOMEM;
3635         }
3636
3637         for (t = 0; t < nr_threads; t++) {
3638                 ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3639                 if (ret) {
3640                         pr_err("Failed to allocate thread masks[%d]\n", t);
3641                         goto out_free;
3642                 }
3643         }
3644
3645         return 0;
3646
3647 out_free:
3648         record__free_thread_masks(rec, nr_threads);
3649
3650         return ret;
3651 }
3652
3653 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3654 {
3655         int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3656
3657         ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3658         if (ret)
3659                 return ret;
3660
3661         rec->nr_threads = nr_cpus;
3662         pr_debug("nr_threads: %d\n", rec->nr_threads);
3663
3664         for (t = 0; t < rec->nr_threads; t++) {
3665                 __set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3666                 __set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3667                 if (verbose > 0) {
3668                         pr_debug("thread_masks[%d]: ", t);
3669                         mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3670                         pr_debug("thread_masks[%d]: ", t);
3671                         mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3672                 }
3673         }
3674
3675         return 0;
3676 }
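/*
 * With --threads (or --threads=cpu) on, say, a 4-CPU system-wide session,
 * the loop above yields four single-bit masks: thread_masks[2] has only
 * CPU 2 set in both its maps and affinity bitmaps, so each reader thread
 * drains, and is pinned to, exactly one CPU.  (Illustrative; the actual
 * CPUs come from the evlist cpu map.)
 */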
3677
3678 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3679                                           const char **maps_spec, const char **affinity_spec,
3680                                           u32 nr_spec)
3681 {
3682         u32 s;
3683         int ret = 0, t = 0;
3684         struct mmap_cpu_mask cpus_mask;
3685         struct thread_mask thread_mask, full_mask, *thread_masks;
3686
3687         ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3688         if (ret) {
3689                 pr_err("Failed to allocate CPUs mask\n");
3690                 return ret;
3691         }
3692
3693         ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3694         if (ret) {
3695                 pr_err("Failed to init cpu mask\n");
3696                 goto out_free_cpu_mask;
3697         }
3698
3699         ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3700         if (ret) {
3701                 pr_err("Failed to allocate full mask\n");
3702                 goto out_free_cpu_mask;
3703         }
3704
3705         ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3706         if (ret) {
3707                 pr_err("Failed to allocate thread mask\n");
3708                 goto out_free_full_and_cpu_masks;
3709         }
3710
3711         for (s = 0; s < nr_spec; s++) {
3712                 ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3713                 if (ret) {
3714                         pr_err("Failed to initialize maps thread mask\n");
3715                         goto out_free;
3716                 }
3717                 ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3718                 if (ret) {
3719                         pr_err("Failed to initialize affinity thread mask\n");
3720                         goto out_free;
3721                 }
3722
3723                 /* ignore invalid CPUs but do not allow empty masks */
3724                 if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3725                                 cpus_mask.bits, thread_mask.maps.nbits)) {
3726                         pr_err("Empty maps mask: %s\n", maps_spec[s]);
3727                         ret = -EINVAL;
3728                         goto out_free;
3729                 }
3730                 if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3731                                 cpus_mask.bits, thread_mask.affinity.nbits)) {
3732                         pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3733                         ret = -EINVAL;
3734                         goto out_free;
3735                 }
3736
3737                 /* do not allow intersection with other masks (full_mask) */
3738                 if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3739                                       thread_mask.maps.nbits)) {
3740                         pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3741                         ret = -EINVAL;
3742                         goto out_free;
3743                 }
3744                 if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3745                                       thread_mask.affinity.nbits)) {
3746                         pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3747                         ret = -EINVAL;
3748                         goto out_free;
3749                 }
3750
3751                 bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3752                           thread_mask.maps.bits, full_mask.maps.nbits);
3753                 bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3754                           thread_mask.affinity.bits, full_mask.affinity.nbits);
3755
3756                 thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3757                 if (!thread_masks) {
3758                         pr_err("Failed to reallocate thread masks\n");
3759                         ret = -ENOMEM;
3760                         goto out_free;
3761                 }
3762                 rec->thread_masks = thread_masks;
3763                 rec->thread_masks[t] = thread_mask;
3764                 if (verbose > 0) {
3765                         pr_debug("thread_masks[%d]: ", t);
3766                         mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3767                         pr_debug("thread_masks[%d]: ", t);
3768                         mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3769                 }
3770                 t++;
3771                 ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3772                 if (ret) {
3773                         pr_err("Failed to allocate thread mask\n");
3774                         goto out_free_full_and_cpu_masks;
3775                 }
3776         }
3777         rec->nr_threads = t;
3778         pr_debug("nr_threads: %d\n", rec->nr_threads);
3779         if (!rec->nr_threads)
3780                 ret = -EINVAL;
3781
3782 out_free:
3783         record__thread_mask_free(&thread_mask);
3784 out_free_full_and_cpu_masks:
3785         record__thread_mask_free(&full_mask);
3786 out_free_cpu_mask:
3787         record__mmap_cpu_mask_free(&cpus_mask);
3788
3789         return ret;
3790 }
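/*
 * Examples of the validation above (hypothetical 8-CPU machine):
 *
 *   --threads=0-3/0-3:4-7/4-7    accepted: two threads with disjoint masks
 *   --threads=0-5/0:2-7/2        rejected ("Intersecting maps mask"): CPUs 2-5
 *                                appear in both specs
 *   -C 0-3 --threads=4-7/4-7     rejected ("Empty maps mask"): the spec is
 *                                ANDed with the recorded CPUs 0-3
 */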
3791
3792 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3793 {
3794         int ret;
3795         struct cpu_topology *topo;
3796
3797         topo = cpu_topology__new();
3798         if (!topo) {
3799                 pr_err("Failed to allocate CPU topology\n");
3800                 return -ENOMEM;
3801         }
3802
3803         ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3804                                              topo->core_cpus_list, topo->core_cpus_lists);
3805         cpu_topology__delete(topo);
3806
3807         return ret;
3808 }
3809
3810 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3811 {
3812         int ret;
3813         struct cpu_topology *topo;
3814
3815         topo = cpu_topology__new();
3816         if (!topo) {
3817                 pr_err("Failed to allocate CPU topology\n");
3818                 return -ENOMEM;
3819         }
3820
3821         ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3822                                              topo->package_cpus_list, topo->package_cpus_lists);
3823         cpu_topology__delete(topo);
3824
3825         return ret;
3826 }
3827
3828 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3829 {
3830         u32 s;
3831         int ret;
3832         const char **spec;
3833         struct numa_topology *topo;
3834
3835         topo = numa_topology__new();
3836         if (!topo) {
3837                 pr_err("Failed to allocate NUMA topology\n");
3838                 return -ENOMEM;
3839         }
3840
3841         spec = zalloc(topo->nr * sizeof(char *));
3842         if (!spec) {
3843                 pr_err("Failed to allocate NUMA spec\n");
3844                 ret = -ENOMEM;
3845                 goto out_delete_topo;
3846         }
3847         for (s = 0; s < topo->nr; s++)
3848                 spec[s] = topo->nodes[s].cpus;
3849
3850         ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3851
3852         zfree(&spec);
3853
3854 out_delete_topo:
3855         numa_topology__delete(topo);
3856
3857         return ret;
3858 }
3859
3860 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3861 {
3862         int t, ret;
3863         u32 s, nr_spec = 0;
3864         char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3865         char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3866
3867         for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3868                 spec = strtok_r(user_spec, ":", &spec_ptr);
3869                 if (spec == NULL)
3870                         break;
3871                 pr_debug2("threads_spec[%d]: %s\n", t, spec);
3872                 mask = strtok_r(spec, "/", &mask_ptr);
3873                 if (mask == NULL)
3874                         break;
3875                 pr_debug2("  maps mask: %s\n", mask);
3876                 tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3877                 if (!tmp_spec) {
3878                         pr_err("Failed to reallocate maps spec\n");
3879                         ret = -ENOMEM;
3880                         goto out_free;
3881                 }
3882                 maps_spec = tmp_spec;
3883                 maps_spec[nr_spec] = dup_mask = strdup(mask);
3884                 if (!maps_spec[nr_spec]) {
3885                         pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3886                         ret = -ENOMEM;
3887                         goto out_free;
3888                 }
3889                 mask = strtok_r(NULL, "/", &mask_ptr);
3890                 if (mask == NULL) {
3891                         pr_err("Invalid thread maps or affinity specs\n");
3892                         ret = -EINVAL;
3893                         goto out_free;
3894                 }
3895                 pr_debug2("  affinity mask: %s\n", mask);
3896                 tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3897                 if (!tmp_spec) {
3898                         pr_err("Failed to reallocate affinity spec\n");
3899                         ret = -ENOMEM;
3900                         goto out_free;
3901                 }
3902                 affinity_spec = tmp_spec;
3903                 affinity_spec[nr_spec] = strdup(mask);
3904                 if (!affinity_spec[nr_spec]) {
3905                         pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3906                         ret = -ENOMEM;
3907                         goto out_free;
3908                 }
3909                 dup_mask = NULL;
3910                 nr_spec++;
3911         }
3912
3913         ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3914                                              (const char **)affinity_spec, nr_spec);
3915
3916 out_free:
3917         free(dup_mask);
3918         for (s = 0; s < nr_spec; s++) {
3919                 if (maps_spec)
3920                         free(maps_spec[s]);
3921                 if (affinity_spec)
3922                         free(affinity_spec[s]);
3923         }
3924         free(affinity_spec);
3925         free(maps_spec);
3926
3927         return ret;
3928 }
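/*
 * The tokenizing above turns, e.g., the (hypothetical) spec "0-3/0:4-7/4"
 * into maps_spec = { "0-3", "4-7" } and affinity_spec = { "0", "4" }:
 * entries are ':'-separated and each entry is a '/'-separated
 * <maps cpus>/<affinity cpus> pair; a missing affinity half is rejected
 * with -EINVAL.
 */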
3929
3930 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3931 {
3932         int ret;
3933
3934         ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3935         if (ret)
3936                 return ret;
3937
3938         if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
3939                 return -ENODEV;
3940
3941         rec->nr_threads = 1;
3942
3943         return 0;
3944 }
3945
3946 static int record__init_thread_masks(struct record *rec)
3947 {
3948         int ret = 0;
3949         struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
3950
3951         if (!record__threads_enabled(rec))
3952                 return record__init_thread_default_masks(rec, cpus);
3953
3954         if (evlist__per_thread(rec->evlist)) {
3955                 pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
3956                 return -EINVAL;
3957         }
3958
3959         switch (rec->opts.threads_spec) {
3960         case THREAD_SPEC__CPU:
3961                 ret = record__init_thread_cpu_masks(rec, cpus);
3962                 break;
3963         case THREAD_SPEC__CORE:
3964                 ret = record__init_thread_core_masks(rec, cpus);
3965                 break;
3966         case THREAD_SPEC__PACKAGE:
3967                 ret = record__init_thread_package_masks(rec, cpus);
3968                 break;
3969         case THREAD_SPEC__NUMA:
3970                 ret = record__init_thread_numa_masks(rec, cpus);
3971                 break;
3972         case THREAD_SPEC__USER:
3973                 ret = record__init_thread_user_masks(rec, cpus);
3974                 break;
3975         default:
3976                 break;
3977         }
3978
3979         return ret;
3980 }
3981
3982 int cmd_record(int argc, const char **argv)
3983 {
3984         int err;
3985         struct record *rec = &record;
3986         char errbuf[BUFSIZ];
3987
3988         setlocale(LC_ALL, "");
3989
3990 #ifndef HAVE_BPF_SKEL
3991 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
3992         set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
3993 # undef set_nobuild
3994 #endif
3995
3996         /* Disable eager loading of kernel symbols that adds overhead to perf record. */
3997         symbol_conf.lazy_load_kernel_maps = true;
3998         rec->opts.affinity = PERF_AFFINITY_SYS;
3999
4000         rec->evlist = evlist__new();
4001         if (rec->evlist == NULL)
4002                 return -ENOMEM;
4003
4004         err = perf_config(perf_record_config, rec);
4005         if (err)
4006                 return err;
4007
4008         argc = parse_options(argc, argv, record_options, record_usage,
4009                             PARSE_OPT_STOP_AT_NON_OPTION);
4010         if (quiet)
4011                 perf_quiet_option();
4012
4013         err = symbol__validate_sym_arguments();
4014         if (err)
4015                 return err;
4016
4017         perf_debuginfod_setup(&record.debuginfod);
4018
4019         /* Make system wide (-a) the default target. */
4020         if (!argc && target__none(&rec->opts.target))
4021                 rec->opts.target.system_wide = true;
4022
4023         if (nr_cgroups && !rec->opts.target.system_wide) {
4024                 usage_with_options_msg(record_usage, record_options,
4025                         "cgroup monitoring only available in system-wide mode");
4026
4027         }
4028
4029         if (rec->buildid_mmap) {
4030                 if (!perf_can_record_build_id()) {
4031                         pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
4032                         err = -EINVAL;
4033                         goto out_opts;
4034                 }
4035                 pr_debug("Enabling build id in mmap2 events.\n");
4036                 /* Enable mmap build id synthesizing. */
4037                 symbol_conf.buildid_mmap2 = true;
4038                 /* Enable perf_event_attr::build_id bit. */
4039                 rec->opts.build_id = true;
4040                 /* Disable build id cache. */
4041                 rec->no_buildid = true;
4042         }
4043
4044         if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4045                 pr_err("Kernel has no cgroup sampling support.\n");
4046                 err = -EINVAL;
4047                 goto out_opts;
4048         }
4049
4050         if (rec->opts.kcore)
4051                 rec->opts.text_poke = true;
4052
4053         if (rec->opts.kcore || record__threads_enabled(rec))
4054                 rec->data.is_dir = true;
4055
4056         if (record__threads_enabled(rec)) {
4057                 if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4058                         pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
4059                         goto out_opts;
4060                 }
4061                 if (record__aio_enabled(rec)) {
4062                         pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
4063                         goto out_opts;
4064                 }
4065         }
4066
4067         if (rec->opts.comp_level != 0) {
4068                 pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4069                 rec->no_buildid = true;
4070         }
4071
4072         if (rec->opts.record_switch_events &&
4073             !perf_can_record_switch_events()) {
4074                 ui__error("kernel does not support recording context switch events\n");
4075                 parse_options_usage(record_usage, record_options, "switch-events", 0);
4076                 err = -EINVAL;
4077                 goto out_opts;
4078         }
4079
4080         if (switch_output_setup(rec)) {
4081                 parse_options_usage(record_usage, record_options, "switch-output", 0);
4082                 err = -EINVAL;
4083                 goto out_opts;
4084         }
4085
4086         if (rec->switch_output.time) {
4087                 signal(SIGALRM, alarm_sig_handler);
4088                 alarm(rec->switch_output.time);
4089         }
4090
4091         if (rec->switch_output.num_files) {
4092                 rec->switch_output.filenames = calloc(rec->switch_output.num_files,
4093                                                       sizeof(char *));
4094                 if (!rec->switch_output.filenames) {
4095                         err = -EINVAL;
4096                         goto out_opts;
4097                 }
4098         }
4099
4100         if (rec->timestamp_filename && record__threads_enabled(rec)) {
4101                 rec->timestamp_filename = false;
4102                 pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4103         }
4104
4105         /*
4106          * Allow aliases to facilitate the lookup of symbols for address
4107          * filters. Refer to auxtrace_parse_filters().
4108          */
4109         symbol_conf.allow_aliases = true;
4110
4111         symbol__init(NULL);
4112
4113         err = record__auxtrace_init(rec);
4114         if (err)
4115                 goto out;
4116
4117         if (dry_run)
4118                 goto out;
4119
4120         err = -ENOMEM;
4121
4122         if (rec->no_buildid_cache || rec->no_buildid) {
4123                 disable_buildid_cache();
4124         } else if (rec->switch_output.enabled) {
4125                 /*
4126                  * In 'perf record --switch-output', disable buildid
4127                  * generation by default to reduce data file switching
4128                  * overhead. Still generate buildids if they are explicitly
4129                  * required using
4130                  *
4131                  *  perf record --switch-output --no-no-buildid \
4132                  *              --no-no-buildid-cache
4133                  *
4134                  * The following code is equivalent to:
4135                  *
4136                  * if ((rec->no_buildid || !rec->no_buildid_set) &&
4137                  *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4138                  *         disable_buildid_cache();
4139                  */
4140                 bool disable = true;
4141
4142                 if (rec->no_buildid_set && !rec->no_buildid)
4143                         disable = false;
4144                 if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4145                         disable = false;
4146                 if (disable) {
4147                         rec->no_buildid = true;
4148                         rec->no_buildid_cache = true;
4149                         disable_buildid_cache();
4150                 }
4151         }
4152
4153         if (record.opts.overwrite)
4154                 record.opts.tail_synthesize = true;
4155
4156         if (rec->evlist->core.nr_entries == 0) {
4157                 bool can_profile_kernel = perf_event_paranoid_check(1);
4158
4159                 err = parse_event(rec->evlist, can_profile_kernel ? "cycles:P" : "cycles:Pu");
4160                 if (err)
4161                         goto out;
4162         }
4163
4164         if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4165                 rec->opts.no_inherit = true;
4166
4167         err = target__validate(&rec->opts.target);
4168         if (err) {
4169                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4170                 ui__warning("%s\n", errbuf);
4171         }
4172
4173         err = target__parse_uid(&rec->opts.target);
4174         if (err) {
4175                 int saved_errno = errno;
4176
4177                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4178                 ui__error("%s", errbuf);
4179
4180                 err = -saved_errno;
4181                 goto out;
4182         }
4183
4184         /* Enable ignoring missing threads when the -u or -p option is given. */
4185         rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
4186
4187         evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list);
4188
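             /*
              * With frame pointer based call chains, let the architecture add
              * any extra record options it needs to report the caller of leaf
              * functions (arm64 does this, for example).
              */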
4189         if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4190                 arch__add_leaf_frame_record_opts(&rec->opts);
4191
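             /* Build the CPU and thread maps the events will be opened on. */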
4192         err = -ENOMEM;
4193         if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4194                 if (rec->opts.target.pid != NULL) {
4195                         pr_err("Couldn't create thread/CPU maps: %s\n",
4196                                 errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4197                         goto out;
4198                 } else {
4199                         usage_with_options(record_usage, record_options);
4200                 }
4201         }
4202
4203         err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4204         if (err)
4205                 goto out;
4206
4207         /*
4208          * Take all buildids when the file contains AUX area
4209          * tracing data: decoding the trace to find only the
4210          * relevant ones would take too long.
4211          */
4212         if (rec->opts.full_auxtrace)
4213                 rec->buildid_all = true;
4214
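             /*
              * PERF_RECORD_TEXT_POKE events record modifications to kernel
              * text (e.g. jump labels) so that trace decoders such as Intel PT
              * see up-to-date instruction bytes.
              */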
4215         if (rec->opts.text_poke) {
4216                 err = record__config_text_poke(rec->evlist);
4217                 if (err) {
4218                         pr_err("record__config_text_poke failed, error %d\n", err);
4219                         goto out;
4220                 }
4221         }
4222
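             /*
              * --off-cpu attaches a BPF program to sample the time tasks spend
              * blocked off the CPU.
              */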
4223         if (rec->off_cpu) {
4224                 err = record__config_off_cpu(rec);
4225                 if (err) {
4226                         pr_err("record__config_off_cpu failed, error %d\n", err);
4227                         goto out;
4228                 }
4229         }
4230
4231         if (record_opts__config(&rec->opts)) {
4232                 err = -EINVAL;
4233                 goto out;
4234         }
4235
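             /*
              * Configure the event that will carry the side-band tracking
              * records (mmap, comm, task, ...), adding a dummy event when
              * needed.
              */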
4236         err = record__config_tracking_events(rec);
4237         if (err) {
4238                 pr_err("record__config_tracking_events failed, error %d\n", err);
4239                 goto out;
4240         }
4241
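             /*
              * Compute the per-thread mmap and CPU affinity masks used for
              * parallel data streaming (--threads); with a single thread the
              * mask simply covers all mapped CPUs.
              */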
4242         err = record__init_thread_masks(rec);
4243         if (err) {
4244                 pr_err("Failed to initialize parallel data streaming masks\n");
4245                 goto out;
4246         }
4247
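             /*
              * Clamp the number of asynchronous trace writing (--aio) control
              * blocks to the supported maximum.
              */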
4248         if (rec->opts.nr_cblocks > nr_cblocks_max)
4249                 rec->opts.nr_cblocks = nr_cblocks_max;
4250         pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4251
4252         pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4253         pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4254
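             /*
              * Clamp the requested compression level (-z) to the maximum the
              * compression library supports.
              */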
4255         if (rec->opts.comp_level > comp_level_max)
4256                 rec->opts.comp_level = comp_level_max;
4257         pr_debug("comp level: %d\n", rec->opts.comp_level);
4258
4259         err = __cmd_record(&record, argc, argv);
4260 out:
4261         evlist__delete(rec->evlist);
4262         symbol__exit();
4263         auxtrace_record__free(rec->itr);
4264 out_opts:
4265         record__free_thread_masks(rec, rec->nr_threads);
4266         rec->nr_threads = 0;
4267         evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4268         return err;
4269 }
4270
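     /*
      * SIGUSR2 handler: take an AUX area tracing snapshot and, when
      * --switch-output=signal is used, request an output file switch.
      */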
4271 static void snapshot_sig_handler(int sig __maybe_unused)
4272 {
4273         struct record *rec = &record;
4274
4275         hit_auxtrace_snapshot_trigger(rec);
4276
4277         if (switch_output_signal(rec))
4278                 trigger_hit(&switch_output_trigger);
4279 }
4280
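     /*
      * SIGALRM handler armed above for --switch-output=<time>: request an
      * output file switch once the interval has elapsed.
      */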
4281 static void alarm_sig_handler(int sig __maybe_unused)
4282 {
4283         struct record *rec = &record;
4284
4285         if (switch_output_time(rec))
4286                 trigger_hit(&switch_output_trigger);
4287 }