1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include "util/parse-events.h"
14 #include "util/config.h"
15
16 #include "util/callchain.h"
17 #include "util/cgroup.h"
18 #include "util/header.h"
19 #include "util/event.h"
20 #include "util/evlist.h"
21 #include "util/evsel.h"
22 #include "util/debug.h"
23 #include "util/mmap.h"
24 #include "util/target.h"
25 #include "util/session.h"
26 #include "util/tool.h"
27 #include "util/symbol.h"
28 #include "util/record.h"
29 #include "util/cpumap.h"
30 #include "util/thread_map.h"
31 #include "util/data.h"
32 #include "util/perf_regs.h"
33 #include "util/auxtrace.h"
34 #include "util/tsc.h"
35 #include "util/parse-branch-options.h"
36 #include "util/parse-regs-options.h"
37 #include "util/llvm-utils.h"
38 #include "util/bpf-loader.h"
39 #include "util/trigger.h"
40 #include "util/perf-hooks.h"
41 #include "util/cpu-set-sched.h"
42 #include "util/synthetic-events.h"
43 #include "util/time-utils.h"
44 #include "util/units.h"
45 #include "util/bpf-event.h"
46 #include "asm/bug.h"
47 #include "perf.h"
48
49 #include <errno.h>
50 #include <inttypes.h>
51 #include <locale.h>
52 #include <poll.h>
53 #include <unistd.h>
54 #include <sched.h>
55 #include <signal.h>
56 #include <sys/mman.h>
57 #include <sys/wait.h>
58 #include <linux/err.h>
59 #include <linux/string.h>
60 #include <linux/time64.h>
61 #include <linux/zalloc.h>
62
63 struct switch_output {
64         bool             enabled;
65         bool             signal;
66         unsigned long    size;
67         unsigned long    time;
68         const char      *str;
69         bool             set;
70         char             **filenames;
71         int              num_files;
72         int              cur_file;
73 };
74
75 struct record {
76         struct perf_tool        tool;
77         struct record_opts      opts;
78         u64                     bytes_written;
79         struct perf_data        data;
80         struct auxtrace_record  *itr;
81         struct evlist   *evlist;
82         struct perf_session     *session;
83         int                     realtime_prio;
84         bool                    no_buildid;
85         bool                    no_buildid_set;
86         bool                    no_buildid_cache;
87         bool                    no_buildid_cache_set;
88         bool                    buildid_all;
89         bool                    timestamp_filename;
90         bool                    timestamp_boundary;
91         struct switch_output    switch_output;
92         unsigned long long      samples;
93         cpu_set_t               affinity_mask;
94 };
95
96 static volatile int auxtrace_record__snapshot_started;
97 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
98 static DEFINE_TRIGGER(switch_output_trigger);
99
100 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
101         "SYS", "NODE", "CPU"
102 };
103
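/*
 * The switch_output_*() helpers decide when the current perf.data file
 * should be rotated: on SIGUSR2, after a given amount of written data,
 * or after a given time, respectively.
 */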
104 static bool switch_output_signal(struct record *rec)
105 {
106         return rec->switch_output.signal &&
107                trigger_is_ready(&switch_output_trigger);
108 }
109
110 static bool switch_output_size(struct record *rec)
111 {
112         return rec->switch_output.size &&
113                trigger_is_ready(&switch_output_trigger) &&
114                (rec->bytes_written >= rec->switch_output.size);
115 }
116
117 static bool switch_output_time(struct record *rec)
118 {
119         return rec->switch_output.time &&
120                trigger_is_ready(&switch_output_trigger);
121 }
122
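/*
 * Write 'size' bytes from 'bf' to the perf.data file and account them in
 * rec->bytes_written; hit the switch-output trigger once the --switch-output
 * size threshold has been crossed.
 */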
123 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
124                          void *bf, size_t size)
125 {
126         struct perf_data_file *file = &rec->session->data->file;
127
128         if (perf_data_file__write(file, bf, size) < 0) {
129                 pr_err("failed to write perf data, error: %m\n");
130                 return -1;
131         }
132
133         rec->bytes_written += size;
134
135         if (switch_output_size(rec))
136                 trigger_hit(&switch_output_trigger);
137
138         return 0;
139 }
140
141 static int record__aio_enabled(struct record *rec);
142 static int record__comp_enabled(struct record *rec);
143 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
144                             void *src, size_t src_size);
145
146 #ifdef HAVE_AIO_SUPPORT
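/*
 * Queue an asynchronous write of 'size' bytes from 'buf' at file offset
 * 'off'.  Retry while the kernel keeps returning EAGAIN; give up and mark
 * the control block unused on any other error.
 */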
147 static int record__aio_write(struct aiocb *cblock, int trace_fd,
148                 void *buf, size_t size, off_t off)
149 {
150         int rc;
151
152         cblock->aio_fildes = trace_fd;
153         cblock->aio_buf    = buf;
154         cblock->aio_nbytes = size;
155         cblock->aio_offset = off;
156         cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
157
158         do {
159                 rc = aio_write(cblock);
160                 if (rc == 0) {
161                         break;
162                 } else if (errno != EAGAIN) {
163                         cblock->aio_fildes = -1;
164                         pr_err("failed to queue perf data, error: %m\n");
165                         break;
166                 }
167         } while (1);
168
169         return rc;
170 }
171
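/*
 * Check whether the aio write described by 'cblock' has finished.  If the
 * kernel wrote only part of the chunk, restart the write with the remainder.
 * Returns 1 when the block is free for reuse, 0 while it is still in flight.
 */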
172 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
173 {
174         void *rem_buf;
175         off_t rem_off;
176         size_t rem_size;
177         int rc, aio_errno;
178         ssize_t aio_ret, written;
179
180         aio_errno = aio_error(cblock);
181         if (aio_errno == EINPROGRESS)
182                 return 0;
183
184         written = aio_ret = aio_return(cblock);
185         if (aio_ret < 0) {
186                 if (aio_errno != EINTR)
187                         pr_err("failed to write perf data, error: %m\n");
188                 written = 0;
189         }
190
191         rem_size = cblock->aio_nbytes - written;
192
193         if (rem_size == 0) {
194                 cblock->aio_fildes = -1;
195                 /*
196                  * md->refcount is incremented in record__aio_pushfn() for
197                  * every aio write request started in record__aio_push() so
198                  * decrement it because the request is now complete.
199                  */
200                 perf_mmap__put(md);
201                 rc = 1;
202         } else {
203                 /*
204                  * The aio write request may require a restart with the
205                  * remainder if the kernel didn't write the whole
206                  * chunk at once.
207                  */
208                 rem_off = cblock->aio_offset + written;
209                 rem_buf = (void *)(cblock->aio_buf + written);
210                 record__aio_write(cblock, cblock->aio_fildes,
211                                 rem_buf, rem_size, rem_off);
212                 rc = 0;
213         }
214
215         return rc;
216 }
217
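/*
 * Reap completed aio control blocks.  With 'sync_all' set, wait (via
 * aio_suspend()) until every in-flight request has completed; otherwise
 * return the index of the first block that is free for the next write.
 */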
218 static int record__aio_sync(struct mmap *md, bool sync_all)
219 {
220         struct aiocb **aiocb = md->aio.aiocb;
221         struct aiocb *cblocks = md->aio.cblocks;
222         struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
223         int i, do_suspend;
224
225         do {
226                 do_suspend = 0;
227                 for (i = 0; i < md->aio.nr_cblocks; ++i) {
228                         if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
229                                 if (sync_all)
230                                         aiocb[i] = NULL;
231                                 else
232                                         return i;
233                         } else {
234                                 /*
235                                  * A started aio write is not complete yet,
236                                  * so it has to be waited for before the
237                                  * next allocation.
238                                  */
239                                 aiocb[i] = &cblocks[i];
240                                 do_suspend = 1;
241                         }
242                 }
243                 if (!do_suspend)
244                         return -1;
245
246                 while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
247                         if (!(errno == EAGAIN || errno == EINTR))
248                                 pr_err("failed to sync perf data, error: %m\n");
249                 }
250         } while (1);
251 }
252
253 struct record_aio {
254         struct record   *rec;
255         void            *data;
256         size_t          size;
257 };
258
259 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
260 {
261         struct record_aio *aio = to;
262
263         /*
264          * map->base data pointed to by buf is copied into a free map->aio.data[] buffer
265          * to release space in the kernel buffer as fast as possible, by calling
266          * perf_mmap__consume() from the perf_mmap__push() function.
267          *
268          * That lets the kernel proceed with storing more profiling data into
269          * the kernel buffer earlier than other per-cpu kernel buffers are handled.
270          *
271          * Copying can be done in two steps in case the chunk of profiling data
272          * crosses the upper bound of the kernel buffer. In this case we first move
273          * the part of the data from map->start till the upper bound and then the
274          * remainder from the beginning of the kernel buffer till the end of the data chunk.
275          */
276
277         if (record__comp_enabled(aio->rec)) {
278                 size = zstd_compress(aio->rec->session, aio->data + aio->size,
279                                      perf_mmap__mmap_len(map) - aio->size,
280                                      buf, size);
281         } else {
282                 memcpy(aio->data + aio->size, buf, size);
283         }
284
285         if (!aio->size) {
286                 /*
287                  * Increment map->refcount to guard the map->aio.data[] buffer
288                  * from premature deallocation, because the map object can be
289                  * released before the aio write request started on the
290                  * map->aio.data[] buffer completes.
291                  *
292                  * perf_mmap__put() is done at record__aio_complete()
293                  * after the started aio request completes, or at record__aio_push()
294                  * if the request failed to start.
295                  */
296                 perf_mmap__get(map);
297         }
298
299         aio->size += size;
300
301         return size;
302 }
303
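/*
 * Drain one kernel ring buffer into a free map->aio.data[] buffer and start
 * an asynchronous write of it to the perf.data file at offset *off.
 */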
304 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
305 {
306         int ret, idx;
307         int trace_fd = rec->session->data->file.fd;
308         struct record_aio aio = { .rec = rec, .size = 0 };
309
310         /*
311          * Call record__aio_sync() to wait until a map->aio.data[] buffer
312          * becomes available after the previous aio write operation.
313          */
314
315         idx = record__aio_sync(map, false);
316         aio.data = map->aio.data[idx];
317         ret = perf_mmap__push(map, &aio, record__aio_pushfn);
318         if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
319                 return ret;
320
321         rec->samples++;
322         ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
323         if (!ret) {
324                 *off += aio.size;
325                 rec->bytes_written += aio.size;
326                 if (switch_output_size(rec))
327                         trigger_hit(&switch_output_trigger);
328         } else {
329                 /*
330                  * Decrement map->refcount incremented in record__aio_pushfn()
331                  * back if record__aio_write() operation failed to start, otherwise
332                  * map->refcount is decremented in record__aio_complete() after
333                  * aio write operation finishes successfully.
334                  */
335                 perf_mmap__put(map);
336         }
337
338         return ret;
339 }
340
341 static off_t record__aio_get_pos(int trace_fd)
342 {
343         return lseek(trace_fd, 0, SEEK_CUR);
344 }
345
346 static void record__aio_set_pos(int trace_fd, off_t pos)
347 {
348         lseek(trace_fd, pos, SEEK_SET);
349 }
350
351 static void record__aio_mmap_read_sync(struct record *rec)
352 {
353         int i;
354         struct evlist *evlist = rec->evlist;
355         struct mmap *maps = evlist->mmap;
356
357         if (!record__aio_enabled(rec))
358                 return;
359
360         for (i = 0; i < evlist->nr_mmaps; i++) {
361                 struct mmap *map = &maps[i];
362
363                 if (map->base)
364                         record__aio_sync(map, true);
365         }
366 }
367
368 static int nr_cblocks_default = 1;
369 static int nr_cblocks_max = 4;
370
371 static int record__aio_parse(const struct option *opt,
372                              const char *str,
373                              int unset)
374 {
375         struct record_opts *opts = (struct record_opts *)opt->value;
376
377         if (unset) {
378                 opts->nr_cblocks = 0;
379         } else {
380                 if (str)
381                         opts->nr_cblocks = strtol(str, NULL, 0);
382                 if (!opts->nr_cblocks)
383                         opts->nr_cblocks = nr_cblocks_default;
384         }
385
386         return 0;
387 }
388 #else /* HAVE_AIO_SUPPORT */
389 static int nr_cblocks_max = 0;
390
391 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
392                             off_t *off __maybe_unused)
393 {
394         return -1;
395 }
396
397 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
398 {
399         return -1;
400 }
401
402 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
403 {
404 }
405
406 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
407 {
408 }
409 #endif
410
411 static int record__aio_enabled(struct record *rec)
412 {
413         return rec->opts.nr_cblocks > 0;
414 }
415
416 #define MMAP_FLUSH_DEFAULT 1
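/*
 * Parse the --mmap-flush value: a plain number of bytes or a B/K/M/G
 * suffixed size.  It defaults to MMAP_FLUSH_DEFAULT and is capped at a
 * quarter of the mmap buffer size.
 */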
417 static int record__mmap_flush_parse(const struct option *opt,
418                                     const char *str,
419                                     int unset)
420 {
421         int flush_max;
422         struct record_opts *opts = (struct record_opts *)opt->value;
423         static struct parse_tag tags[] = {
424                         { .tag  = 'B', .mult = 1       },
425                         { .tag  = 'K', .mult = 1 << 10 },
426                         { .tag  = 'M', .mult = 1 << 20 },
427                         { .tag  = 'G', .mult = 1 << 30 },
428                         { .tag  = 0 },
429         };
430
431         if (unset)
432                 return 0;
433
434         if (str) {
435                 opts->mmap_flush = parse_tag_value(str, tags);
436                 if (opts->mmap_flush == (int)-1)
437                         opts->mmap_flush = strtol(str, NULL, 0);
438         }
439
440         if (!opts->mmap_flush)
441                 opts->mmap_flush = MMAP_FLUSH_DEFAULT;
442
443         flush_max = evlist__mmap_size(opts->mmap_pages);
444         flush_max /= 4;
445         if (opts->mmap_flush > flush_max)
446                 opts->mmap_flush = flush_max;
447
448         return 0;
449 }
450
451 #ifdef HAVE_ZSTD_SUPPORT
452 static unsigned int comp_level_default = 1;
453
454 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
455 {
456         struct record_opts *opts = opt->value;
457
458         if (unset) {
459                 opts->comp_level = 0;
460         } else {
461                 if (str)
462                         opts->comp_level = strtol(str, NULL, 0);
463                 if (!opts->comp_level)
464                         opts->comp_level = comp_level_default;
465         }
466
467         return 0;
468 }
469 #endif
470 static unsigned int comp_level_max = 22;
471
472 static int record__comp_enabled(struct record *rec)
473 {
474         return rec->opts.comp_level > 0;
475 }
476
477 static int process_synthesized_event(struct perf_tool *tool,
478                                      union perf_event *event,
479                                      struct perf_sample *sample __maybe_unused,
480                                      struct machine *machine __maybe_unused)
481 {
482         struct record *rec = container_of(tool, struct record, tool);
483         return record__write(rec, NULL, event, event->header.size);
484 }
485
486 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
487 {
488         struct record *rec = to;
489
490         if (record__comp_enabled(rec)) {
491                 size = zstd_compress(rec->session, map->data, perf_mmap__mmap_len(map), bf, size);
492                 bf   = map->data;
493         }
494
495         rec->samples++;
496         return record__write(rec, map, bf, size);
497 }
498
499 static volatile int done;
500 static volatile int signr = -1;
501 static volatile int child_finished;
502
503 static void sig_handler(int sig)
504 {
505         if (sig == SIGCHLD)
506                 child_finished = 1;
507         else
508                 signr = sig;
509
510         done = 1;
511 }
512
513 static void sigsegv_handler(int sig)
514 {
515         perf_hooks__recover();
516         sighandler_dump_stack(sig);
517 }
518
519 static void record__sig_exit(void)
520 {
521         if (signr == -1)
522                 return;
523
524         signal(signr, SIG_DFL);
525         raise(signr);
526 }
527
528 #ifdef HAVE_AUXTRACE_SUPPORT
529
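/*
 * Write one AUX area trace chunk: the PERF_RECORD_AUXTRACE header followed
 * by up to two data fragments (the AUX buffer may wrap) and alignment
 * padding; for seekable output the file offset is also recorded in the
 * auxtrace index.
 */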
530 static int record__process_auxtrace(struct perf_tool *tool,
531                                     struct mmap *map,
532                                     union perf_event *event, void *data1,
533                                     size_t len1, void *data2, size_t len2)
534 {
535         struct record *rec = container_of(tool, struct record, tool);
536         struct perf_data *data = &rec->data;
537         size_t padding;
538         u8 pad[8] = {0};
539
540         if (!perf_data__is_pipe(data) && !perf_data__is_dir(data)) {
541                 off_t file_offset;
542                 int fd = perf_data__fd(data);
543                 int err;
544
545                 file_offset = lseek(fd, 0, SEEK_CUR);
546                 if (file_offset == -1)
547                         return -1;
548                 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
549                                                      event, file_offset);
550                 if (err)
551                         return err;
552         }
553
554         /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
555         padding = (len1 + len2) & 7;
556         if (padding)
557                 padding = 8 - padding;
558
559         record__write(rec, map, event, event->header.size);
560         record__write(rec, map, data1, len1);
561         if (len2)
562                 record__write(rec, map, data2, len2);
563         record__write(rec, map, &pad, padding);
564
565         return 0;
566 }
567
568 static int record__auxtrace_mmap_read(struct record *rec,
569                                       struct mmap *map)
570 {
571         int ret;
572
573         ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
574                                   record__process_auxtrace);
575         if (ret < 0)
576                 return ret;
577
578         if (ret)
579                 rec->samples++;
580
581         return 0;
582 }
583
584 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
585                                                struct mmap *map)
586 {
587         int ret;
588
589         ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
590                                            record__process_auxtrace,
591                                            rec->opts.auxtrace_snapshot_size);
592         if (ret < 0)
593                 return ret;
594
595         if (ret)
596                 rec->samples++;
597
598         return 0;
599 }
600
601 static int record__auxtrace_read_snapshot_all(struct record *rec)
602 {
603         int i;
604         int rc = 0;
605
606         for (i = 0; i < rec->evlist->nr_mmaps; i++) {
607                 struct mmap *map = &rec->evlist->mmap[i];
608
609                 if (!map->auxtrace_mmap.base)
610                         continue;
611
612                 if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
613                         rc = -1;
614                         goto out;
615                 }
616         }
617 out:
618         return rc;
619 }
620
621 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
622 {
623         pr_debug("Recording AUX area tracing snapshot\n");
624         if (record__auxtrace_read_snapshot_all(rec) < 0) {
625                 trigger_error(&auxtrace_snapshot_trigger);
626         } else {
627                 if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
628                         trigger_error(&auxtrace_snapshot_trigger);
629                 else
630                         trigger_ready(&auxtrace_snapshot_trigger);
631         }
632 }
633
634 static int record__auxtrace_snapshot_exit(struct record *rec)
635 {
636         if (trigger_is_error(&auxtrace_snapshot_trigger))
637                 return 0;
638
639         if (!auxtrace_record__snapshot_started &&
640             auxtrace_record__snapshot_start(rec->itr))
641                 return -1;
642
643         record__read_auxtrace_snapshot(rec, true);
644         if (trigger_is_error(&auxtrace_snapshot_trigger))
645                 return -1;
646
647         return 0;
648 }
649
650 static int record__auxtrace_init(struct record *rec)
651 {
652         int err;
653
654         if (!rec->itr) {
655                 rec->itr = auxtrace_record__init(rec->evlist, &err);
656                 if (err)
657                         return err;
658         }
659
660         err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
661                                               rec->opts.auxtrace_snapshot_opts);
662         if (err)
663                 return err;
664
665         return auxtrace_parse_filters(rec->evlist);
666 }
667
668 #else
669
670 static inline
671 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
672                                struct mmap *map __maybe_unused)
673 {
674         return 0;
675 }
676
677 static inline
678 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
679                                     bool on_exit __maybe_unused)
680 {
681 }
682
683 static inline
684 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
685 {
686         return 0;
687 }
688
689 static inline
690 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
691 {
692         return 0;
693 }
694
695 static int record__auxtrace_init(struct record *rec __maybe_unused)
696 {
697         return 0;
698 }
699
700 #endif
701
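/*
 * mmap the per-CPU ring buffers (and AUX area buffers) for the whole evlist,
 * turning the most common failure, EPERM, into a hint about
 * /proc/sys/kernel/perf_event_mlock_kb.
 */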
702 static int record__mmap_evlist(struct record *rec,
703                                struct evlist *evlist)
704 {
705         struct record_opts *opts = &rec->opts;
706         char msg[512];
707
708         if (opts->affinity != PERF_AFFINITY_SYS)
709                 cpu__setup_cpunode_map();
710
711         if (evlist__mmap_ex(evlist, opts->mmap_pages,
712                                  opts->auxtrace_mmap_pages,
713                                  opts->auxtrace_snapshot_mode,
714                                  opts->nr_cblocks, opts->affinity,
715                                  opts->mmap_flush, opts->comp_level) < 0) {
716                 if (errno == EPERM) {
717                         pr_err("Permission error mapping pages.\n"
718                                "Consider increasing "
719                                "/proc/sys/kernel/perf_event_mlock_kb,\n"
720                                "or try again with a smaller value of -m/--mmap_pages.\n"
721                                "(current value: %u,%u)\n",
722                                opts->mmap_pages, opts->auxtrace_mmap_pages);
723                         return -errno;
724                 } else {
725                         pr_err("failed to mmap with %d (%s)\n", errno,
726                                 str_error_r(errno, msg, sizeof(msg)));
727                         if (errno)
728                                 return -errno;
729                         else
730                                 return -EINVAL;
731                 }
732         }
733         return 0;
734 }
735
736 static int record__mmap(struct record *rec)
737 {
738         return record__mmap_evlist(rec, rec->evlist);
739 }
740
741 static int record__open(struct record *rec)
742 {
743         char msg[BUFSIZ];
744         struct evsel *pos;
745         struct evlist *evlist = rec->evlist;
746         struct perf_session *session = rec->session;
747         struct record_opts *opts = &rec->opts;
748         int rc = 0;
749
750         /*
751          * For initial_delay we need to add a dummy event so that we can track
752          * PERF_RECORD_MMAP while we wait for the initial delay to enable the
753          * real events, the ones asked for by the user.
754          */
755         if (opts->initial_delay) {
756                 if (perf_evlist__add_dummy(evlist))
757                         return -ENOMEM;
758
759                 pos = perf_evlist__first(evlist);
760                 pos->tracking = 0;
761                 pos = perf_evlist__last(evlist);
762                 pos->tracking = 1;
763                 pos->core.attr.enable_on_exec = 1;
764         }
765
766         perf_evlist__config(evlist, opts, &callchain_param);
767
768         evlist__for_each_entry(evlist, pos) {
769 try_again:
770                 if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
771                         if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
772                                 if (verbose > 0)
773                                         ui__warning("%s\n", msg);
774                                 goto try_again;
775                         }
776                         if ((errno == EINVAL || errno == EBADF) &&
777                             pos->leader != pos &&
778                             pos->weak_group) {
779                                 pos = perf_evlist__reset_weak_group(evlist, pos);
780                                 goto try_again;
781                         }
782                         rc = -errno;
783                         perf_evsel__open_strerror(pos, &opts->target,
784                                                   errno, msg, sizeof(msg));
785                         ui__error("%s\n", msg);
786                         goto out;
787                 }
788
789                 pos->supported = true;
790         }
791
792         if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(evlist)) {
793                 pr_warning(
794 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
795 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
796 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
797 "file is not found in the buildid cache or in the vmlinux path.\n\n"
798 "Samples in kernel modules won't be resolved at all.\n\n"
799 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
800 "even with a suitable vmlinux or kallsyms file.\n\n");
801         }
802
803         if (perf_evlist__apply_filters(evlist, &pos)) {
804                 pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
805                         pos->filter, perf_evsel__name(pos), errno,
806                         str_error_r(errno, msg, sizeof(msg)));
807                 rc = -1;
808                 goto out;
809         }
810
811         rc = record__mmap(rec);
812         if (rc)
813                 goto out;
814
815         session->evlist = evlist;
816         perf_session__set_id_hdr_size(session);
817 out:
818         return rc;
819 }
820
821 static int process_sample_event(struct perf_tool *tool,
822                                 union perf_event *event,
823                                 struct perf_sample *sample,
824                                 struct evsel *evsel,
825                                 struct machine *machine)
826 {
827         struct record *rec = container_of(tool, struct record, tool);
828
829         if (rec->evlist->first_sample_time == 0)
830                 rec->evlist->first_sample_time = sample->time;
831
832         rec->evlist->last_sample_time = sample->time;
833
834         if (rec->buildid_all)
835                 return 0;
836
837         rec->samples++;
838         return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
839 }
840
841 static int process_buildids(struct record *rec)
842 {
843         struct perf_session *session = rec->session;
844
845         if (perf_data__size(&rec->data) == 0)
846                 return 0;
847
848         /*
849          * During this process, it'll load the kernel map and replace
850          * dso->long_name with the real pathname it found.  In this case
851          * we prefer the vmlinux path like
852          *   /lib/modules/3.16.4/build/vmlinux
853          *
854          * rather than the build-id path (in the debug directory).
855          *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
856          */
857         symbol_conf.ignore_vmlinux_buildid = true;
858
859         /*
860          * If --buildid-all is given, it marks all DSOs regardless of hits,
861          * so there is no need to process samples. But if timestamp_boundary
862          * is enabled, it still needs to walk all samples to get the
863          * timestamps of the first/last samples.
864          */
865         if (rec->buildid_all && !rec->timestamp_boundary)
866                 rec->tool.sample = NULL;
867
868         return perf_session__process_events(session);
869 }
870
871 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
872 {
873         int err;
874         struct perf_tool *tool = data;
875         /*
876          * As for the guest kernel, when processing the record & report
877          * subcommands we arrange the module mmaps prior to the guest kernel
878          * mmap and trigger a dso preload, because by default guest module
879          * symbols are loaded from guest kallsyms instead of
880          * /lib/modules/XXX/XXX. This avoids missing symbols when the first
881          * address is in a module instead of in the guest kernel.
882          */
883         err = perf_event__synthesize_modules(tool, process_synthesized_event,
884                                              machine);
885         if (err < 0)
886                 pr_err("Couldn't record guest kernel [%d]'s reference"
887                        " relocation symbol.\n", machine->pid);
888
889         /*
890          * We use _stext for the guest kernel because the guest kernel's
891          * /proc/kallsyms sometimes has no _text.
892          */
893         err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
894                                                  machine);
895         if (err < 0)
896                 pr_err("Couldn't record guest kernel [%d]'s reference"
897                        " relocation symbol.\n", machine->pid);
898 }
899
900 static struct perf_event_header finished_round_event = {
901         .size = sizeof(struct perf_event_header),
902         .type = PERF_RECORD_FINISHED_ROUND,
903 };
904
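/*
 * With --affinity=node or --affinity=cpu, migrate the recording thread to
 * the CPU mask of the mmap buffer that is about to be processed, so the
 * copy runs close to the memory it reads.
 */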
905 static void record__adjust_affinity(struct record *rec, struct mmap *map)
906 {
907         if (rec->opts.affinity != PERF_AFFINITY_SYS &&
908             !CPU_EQUAL(&rec->affinity_mask, &map->affinity_mask)) {
909                 CPU_ZERO(&rec->affinity_mask);
910                 CPU_OR(&rec->affinity_mask, &rec->affinity_mask, &map->affinity_mask);
911                 sched_setaffinity(0, sizeof(rec->affinity_mask), &rec->affinity_mask);
912         }
913 }
914
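/*
 * Callback for zstd_compress_stream_to_records(): lay down a
 * PERF_RECORD_COMPRESSED header when a record is started and grow its size
 * as compressed payload is appended.
 */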
915 static size_t process_comp_header(void *record, size_t increment)
916 {
917         struct perf_record_compressed *event = record;
918         size_t size = sizeof(*event);
919
920         if (increment) {
921                 event->header.size += increment;
922                 return increment;
923         }
924
925         event->header.type = PERF_RECORD_COMPRESSED;
926         event->header.size = size;
927
928         return size;
929 }
930
931 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
932                             void *src, size_t src_size)
933 {
934         size_t compressed;
935         size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
936
937         compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
938                                                      max_record_size, process_comp_header);
939
940         session->bytes_transferred += src_size;
941         session->bytes_compressed  += compressed;
942
943         return compressed;
944 }
945
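/*
 * Push the data pending in every ring buffer of the evlist to the output,
 * either synchronously via record__pushfn() or through the aio path, and
 * emit a PERF_RECORD_FINISHED_ROUND event if anything was written.
 */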
946 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
947                                     bool overwrite, bool synch)
948 {
949         u64 bytes_written = rec->bytes_written;
950         int i;
951         int rc = 0;
952         struct mmap *maps;
953         int trace_fd = rec->data.file.fd;
954         off_t off = 0;
955
956         if (!evlist)
957                 return 0;
958
959         maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
960         if (!maps)
961                 return 0;
962
963         if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
964                 return 0;
965
966         if (record__aio_enabled(rec))
967                 off = record__aio_get_pos(trace_fd);
968
969         for (i = 0; i < evlist->nr_mmaps; i++) {
970                 u64 flush = 0;
971                 struct mmap *map = &maps[i];
972
973                 if (map->base) {
974                         record__adjust_affinity(rec, map);
975                         if (synch) {
976                                 flush = map->flush;
977                                 map->flush = 1;
978                         }
979                         if (!record__aio_enabled(rec)) {
980                                 if (perf_mmap__push(map, rec, record__pushfn) < 0) {
981                                         if (synch)
982                                                 map->flush = flush;
983                                         rc = -1;
984                                         goto out;
985                                 }
986                         } else {
987                                 if (record__aio_push(rec, map, &off) < 0) {
988                                         record__aio_set_pos(trace_fd, off);
989                                         if (synch)
990                                                 map->flush = flush;
991                                         rc = -1;
992                                         goto out;
993                                 }
994                         }
995                         if (synch)
996                                 map->flush = flush;
997                 }
998
999                 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1000                     record__auxtrace_mmap_read(rec, map) != 0) {
1001                         rc = -1;
1002                         goto out;
1003                 }
1004         }
1005
1006         if (record__aio_enabled(rec))
1007                 record__aio_set_pos(trace_fd, off);
1008
1009         /*
1010          * Mark the round finished in case we wrote
1011          * at least one event.
1012          */
1013         if (bytes_written != rec->bytes_written)
1014                 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1015
1016         if (overwrite)
1017                 perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1018 out:
1019         return rc;
1020 }
1021
1022 static int record__mmap_read_all(struct record *rec, bool synch)
1023 {
1024         int err;
1025
1026         err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1027         if (err)
1028                 return err;
1029
1030         return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1031 }
1032
1033 static void record__init_features(struct record *rec)
1034 {
1035         struct perf_session *session = rec->session;
1036         int feat;
1037
1038         for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1039                 perf_header__set_feat(&session->header, feat);
1040
1041         if (rec->no_buildid)
1042                 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1043
1044         if (!have_tracepoints(&rec->evlist->core.entries))
1045                 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1046
1047         if (!rec->opts.branch_stack)
1048                 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1049
1050         if (!rec->opts.full_auxtrace)
1051                 perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1052
1053         if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1054                 perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1055
1056         perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1057         if (!record__comp_enabled(rec))
1058                 perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1059
1060         perf_header__clear_feat(&session->header, HEADER_STAT);
1061 }
1062
1063 static void
1064 record__finish_output(struct record *rec)
1065 {
1066         struct perf_data *data = &rec->data;
1067         int fd = perf_data__fd(data);
1068
1069         if (data->is_pipe)
1070                 return;
1071
1072         rec->session->header.data_size += rec->bytes_written;
1073         data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1074
1075         if (!rec->no_buildid) {
1076                 process_buildids(rec);
1077
1078                 if (rec->buildid_all)
1079                         dsos__hit_all(rec->session);
1080         }
1081         perf_session__write_header(rec->session, rec->evlist, fd, true);
1082
1083         return;
1084 }
1085
1086 static int record__synthesize_workload(struct record *rec, bool tail)
1087 {
1088         int err;
1089         struct perf_thread_map *thread_map;
1090
1091         if (rec->opts.tail_synthesize != tail)
1092                 return 0;
1093
1094         thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1095         if (thread_map == NULL)
1096                 return -1;
1097
1098         err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1099                                                  process_synthesized_event,
1100                                                  &rec->session->machines.host,
1101                                                  rec->opts.sample_address);
1102         perf_thread_map__put(thread_map);
1103         return err;
1104 }
1105
1106 static int record__synthesize(struct record *rec, bool tail);
1107
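/*
 * Finish the current output file and switch to a new, timestamped one.
 * Unless called at exit, also re-synthesize the tracking events needed to
 * make the new file decodable on its own.
 */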
1108 static int
1109 record__switch_output(struct record *rec, bool at_exit)
1110 {
1111         struct perf_data *data = &rec->data;
1112         int fd, err;
1113         char *new_filename;
1114
1115         /* Same size as a timestamp like "2015122520103046" */
1116         char timestamp[] = "InvalidTimestamp";
1117
1118         record__aio_mmap_read_sync(rec);
1119
1120         record__synthesize(rec, true);
1121         if (target__none(&rec->opts.target))
1122                 record__synthesize_workload(rec, true);
1123
1124         rec->samples = 0;
1125         record__finish_output(rec);
1126         err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1127         if (err) {
1128                 pr_err("Failed to get current timestamp\n");
1129                 return -EINVAL;
1130         }
1131
1132         fd = perf_data__switch(data, timestamp,
1133                                     rec->session->header.data_offset,
1134                                     at_exit, &new_filename);
1135         if (fd >= 0 && !at_exit) {
1136                 rec->bytes_written = 0;
1137                 rec->session->header.data_size = 0;
1138         }
1139
1140         if (!quiet)
1141                 fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1142                         data->path, timestamp);
1143
1144         if (rec->switch_output.num_files) {
1145                 int n = rec->switch_output.cur_file + 1;
1146
1147                 if (n >= rec->switch_output.num_files)
1148                         n = 0;
1149                 rec->switch_output.cur_file = n;
1150                 if (rec->switch_output.filenames[n]) {
1151                         remove(rec->switch_output.filenames[n]);
1152                         zfree(&rec->switch_output.filenames[n]);
1153                 }
1154                 rec->switch_output.filenames[n] = new_filename;
1155         } else {
1156                 free(new_filename);
1157         }
1158
1159         /* Output tracking events */
1160         if (!at_exit) {
1161                 record__synthesize(rec, false);
1162
1163                 /*
1164                  * In 'perf record --switch-output' without -a,
1165                  * record__synthesize() in record__switch_output() won't
1166                  * generate tracking events because there's no thread_map
1167                  * in the evlist, which causes the newly created perf.data to
1168                  * lack mmap and comm information.
1169                  * Create a fake thread_map and directly call
1170                  * perf_event__synthesize_thread_map() for those events.
1171                  */
1172                 if (target__none(&rec->opts.target))
1173                         record__synthesize_workload(rec, false);
1174         }
1175         return fd;
1176 }
1177
1178 static volatile int workload_exec_errno;
1179
1180 /*
1181  * perf_evlist__prepare_workload will send a SIGUSR1
1182  * if the fork fails, since we asked for it by setting its
1183  * want_signal to true.
1184  */
1185 static void workload_exec_failed_signal(int signo __maybe_unused,
1186                                         siginfo_t *info,
1187                                         void *ucontext __maybe_unused)
1188 {
1189         workload_exec_errno = info->si_value.sival_int;
1190         done = 1;
1191         child_finished = 1;
1192 }
1193
1194 static void snapshot_sig_handler(int sig);
1195 static void alarm_sig_handler(int sig);
1196
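/*
 * Pick any mmap'ed ring buffer control page; record__pick_pc() feeds it to
 * perf_event__synth_time_conv() to synthesize the time conversion record.
 */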
1197 static const struct perf_event_mmap_page *
1198 perf_evlist__pick_pc(struct evlist *evlist)
1199 {
1200         if (evlist) {
1201                 if (evlist->mmap && evlist->mmap[0].base)
1202                         return evlist->mmap[0].base;
1203                 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].base)
1204                         return evlist->overwrite_mmap[0].base;
1205         }
1206         return NULL;
1207 }
1208
1209 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1210 {
1211         const struct perf_event_mmap_page *pc;
1212
1213         pc = perf_evlist__pick_pc(rec->evlist);
1214         if (pc)
1215                 return pc;
1216         return NULL;
1217 }
1218
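/*
 * Synthesize the side-band events (attrs, features, tracing data, kernel
 * and module mmaps, thread and cpu maps, ...) that describe state which
 * already exists and thus will not be reported by the kernel during the run.
 */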
1219 static int record__synthesize(struct record *rec, bool tail)
1220 {
1221         struct perf_session *session = rec->session;
1222         struct machine *machine = &session->machines.host;
1223         struct perf_data *data = &rec->data;
1224         struct record_opts *opts = &rec->opts;
1225         struct perf_tool *tool = &rec->tool;
1226         int fd = perf_data__fd(data);
1227         int err = 0;
1228
1229         if (rec->opts.tail_synthesize != tail)
1230                 return 0;
1231
1232         if (data->is_pipe) {
1233                 /*
1234                  * We need to synthesize events first, because some
1235                  * features work on top of them (on the report side).
1236                  */
1237                 err = perf_event__synthesize_attrs(tool, rec->evlist,
1238                                                    process_synthesized_event);
1239                 if (err < 0) {
1240                         pr_err("Couldn't synthesize attrs.\n");
1241                         goto out;
1242                 }
1243
1244                 err = perf_event__synthesize_features(tool, session, rec->evlist,
1245                                                       process_synthesized_event);
1246                 if (err < 0) {
1247                         pr_err("Couldn't synthesize features.\n");
1248                         return err;
1249                 }
1250
1251                 if (have_tracepoints(&rec->evlist->core.entries)) {
1252                         /*
1253                          * FIXME err <= 0 here actually means that
1254                          * there were no tracepoints, so it's not really
1255                          * an error, just that we don't need to
1256                          * synthesize anything.  We really have to
1257                          * return this more properly and also
1258                          * propagate errors that currently call die().
1259                          */
1260                         err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
1261                                                                   process_synthesized_event);
1262                         if (err <= 0) {
1263                                 pr_err("Couldn't record tracing data.\n");
1264                                 goto out;
1265                         }
1266                         rec->bytes_written += err;
1267                 }
1268         }
1269
1270         err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1271                                           process_synthesized_event, machine);
1272         if (err)
1273                 goto out;
1274
1275         if (rec->opts.full_auxtrace) {
1276                 err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1277                                         session, process_synthesized_event);
1278                 if (err)
1279                         goto out;
1280         }
1281
1282         if (!perf_evlist__exclude_kernel(rec->evlist)) {
1283                 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1284                                                          machine);
1285                 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1286                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1287                                    "Check /proc/kallsyms permission or run as root.\n");
1288
1289                 err = perf_event__synthesize_modules(tool, process_synthesized_event,
1290                                                      machine);
1291                 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1292                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1293                                    "Check /proc/modules permission or run as root.\n");
1294         }
1295
1296         if (perf_guest) {
1297                 machines__process_guests(&session->machines,
1298                                          perf_event__synthesize_guest_os, tool);
1299         }
1300
1301         err = perf_event__synthesize_extra_attr(&rec->tool,
1302                                                 rec->evlist,
1303                                                 process_synthesized_event,
1304                                                 data->is_pipe);
1305         if (err)
1306                 goto out;
1307
1308         err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1309                                                  process_synthesized_event,
1310                                                 NULL);
1311         if (err < 0) {
1312                 pr_err("Couldn't synthesize thread map.\n");
1313                 return err;
1314         }
1315
1316         err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
1317                                              process_synthesized_event, NULL);
1318         if (err < 0) {
1319                 pr_err("Couldn't synthesize cpu map.\n");
1320                 return err;
1321         }
1322
1323         err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1324                                                 machine, opts);
1325         if (err < 0)
1326                 pr_warning("Couldn't synthesize bpf events.\n");
1327
1328         err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
1329                                             process_synthesized_event, opts->sample_address,
1330                                             1);
1331 out:
1332         return err;
1333 }
1334
1335 static int __cmd_record(struct record *rec, int argc, const char **argv)
1336 {
1337         int err;
1338         int status = 0;
1339         unsigned long waking = 0;
1340         const bool forks = argc > 0;
1341         struct perf_tool *tool = &rec->tool;
1342         struct record_opts *opts = &rec->opts;
1343         struct perf_data *data = &rec->data;
1344         struct perf_session *session;
1345         bool disabled = false, draining = false;
1346         struct evlist *sb_evlist = NULL;
1347         int fd;
1348         float ratio = 0;
1349
1350         atexit(record__sig_exit);
1351         signal(SIGCHLD, sig_handler);
1352         signal(SIGINT, sig_handler);
1353         signal(SIGTERM, sig_handler);
1354         signal(SIGSEGV, sigsegv_handler);
1355
1356         if (rec->opts.record_namespaces)
1357                 tool->namespace_events = true;
1358
1359         if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1360                 signal(SIGUSR2, snapshot_sig_handler);
1361                 if (rec->opts.auxtrace_snapshot_mode)
1362                         trigger_on(&auxtrace_snapshot_trigger);
1363                 if (rec->switch_output.enabled)
1364                         trigger_on(&switch_output_trigger);
1365         } else {
1366                 signal(SIGUSR2, SIG_IGN);
1367         }
1368
1369         session = perf_session__new(data, false, tool);
1370         if (IS_ERR(session)) {
1371                 pr_err("Perf session creation failed.\n");
1372                 return PTR_ERR(session);
1373         }
1374
1375         fd = perf_data__fd(data);
1376         rec->session = session;
1377
1378         if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1379                 pr_err("Compression initialization failed.\n");
1380                 return -1;
1381         }
1382
1383         session->header.env.comp_type  = PERF_COMP_ZSTD;
1384         session->header.env.comp_level = rec->opts.comp_level;
1385
1386         record__init_features(rec);
1387
1388         if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1389                 session->header.env.clockid_res_ns = rec->opts.clockid_res_ns;
1390
1391         if (forks) {
1392                 err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
1393                                                     argv, data->is_pipe,
1394                                                     workload_exec_failed_signal);
1395                 if (err < 0) {
1396                         pr_err("Couldn't run the workload!\n");
1397                         status = err;
1398                         goto out_delete_session;
1399                 }
1400         }
1401
1402         /*
1403          * If we have just a single event and are sending data
1404          * through a pipe, we need to force id allocation,
1405          * because we synthesize the event name through the pipe
1406          * and need the id for that.
1407          */
1408         if (data->is_pipe && rec->evlist->core.nr_entries == 1)
1409                 rec->opts.sample_id = true;
1410
1411         if (record__open(rec) != 0) {
1412                 err = -1;
1413                 goto out_child;
1414         }
1415         session->header.env.comp_mmap_len = session->evlist->mmap_len;
1416
1417         err = bpf__apply_obj_config();
1418         if (err) {
1419                 char errbuf[BUFSIZ];
1420
1421                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1422                 pr_err("ERROR: Apply config to BPF failed: %s\n",
1423                          errbuf);
1424                 goto out_child;
1425         }
1426
1427         /*
1428          * Normally perf_session__new would do this, but it doesn't have the
1429          * evlist.
1430          */
1431         if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
1432                 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1433                 rec->tool.ordered_events = false;
1434         }
1435
1436         if (!rec->evlist->nr_groups)
1437                 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1438
1439         if (data->is_pipe) {
1440                 err = perf_header__write_pipe(fd);
1441                 if (err < 0)
1442                         goto out_child;
1443         } else {
1444                 err = perf_session__write_header(session, rec->evlist, fd, false);
1445                 if (err < 0)
1446                         goto out_child;
1447         }
1448
1449         if (!rec->no_buildid
1450             && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1451                 pr_err("Couldn't generate buildids. "
1452                        "Use --no-buildid to profile anyway.\n");
1453                 err = -1;
1454                 goto out_child;
1455         }
1456
1457         if (!opts->no_bpf_event)
1458                 bpf_event__add_sb_event(&sb_evlist, &session->header.env);
1459
1460         if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) {
1461                 pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1462                 opts->no_bpf_event = true;
1463         }
1464
1465         err = record__synthesize(rec, false);
1466         if (err < 0)
1467                 goto out_child;
1468
1469         if (rec->realtime_prio) {
1470                 struct sched_param param;
1471
1472                 param.sched_priority = rec->realtime_prio;
1473                 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1474                         pr_err("Could not set realtime priority.\n");
1475                         err = -1;
1476                         goto out_child;
1477                 }
1478         }
1479
1480         /*
1481          * When perf is starting the traced process, all the events
1482          * (apart from group members) have enable_on_exec=1 set,
1483          * so don't spoil it by prematurely enabling them.
1484          */
1485         if (!target__none(&opts->target) && !opts->initial_delay)
1486                 evlist__enable(rec->evlist);
1487
1488         /*
1489          * Let the child rip
1490          */
1491         if (forks) {
1492                 struct machine *machine = &session->machines.host;
1493                 union perf_event *event;
1494                 pid_t tgid;
1495
1496                 event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1497                 if (event == NULL) {
1498                         err = -ENOMEM;
1499                         goto out_child;
1500                 }
1501
1502                 /*
1503                  * Some H/W events are generated before the COMM event,
1504                  * which is emitted during exec(), so perf script cannot
1505                  * see a correct process name for those events.
1506                  * Synthesize the COMM event first to prevent that.
1507                  */
1508                 tgid = perf_event__synthesize_comm(tool, event,
1509                                                    rec->evlist->workload.pid,
1510                                                    process_synthesized_event,
1511                                                    machine);
1512                 free(event);
1513
1514                 if (tgid == -1)
1515                         goto out_child;
1516
1517                 event = malloc(sizeof(event->namespaces) +
1518                                (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1519                                machine->id_hdr_size);
1520                 if (event == NULL) {
1521                         err = -ENOMEM;
1522                         goto out_child;
1523                 }
1524
1525                 /*
1526                  * Synthesize NAMESPACES event for the command specified.
1527                  */
1528                 perf_event__synthesize_namespaces(tool, event,
1529                                                   rec->evlist->workload.pid,
1530                                                   tgid, process_synthesized_event,
1531                                                   machine);
1532                 free(event);
1533
1534                 perf_evlist__start_workload(rec->evlist);
1535         }
1536
1537         if (opts->initial_delay) {
1538                 usleep(opts->initial_delay * USEC_PER_MSEC);
1539                 evlist__enable(rec->evlist);
1540         }
1541
1542         trigger_ready(&auxtrace_snapshot_trigger);
1543         trigger_ready(&switch_output_trigger);
1544         perf_hooks__invoke_record_start();
1545         for (;;) {
1546                 unsigned long long hits = rec->samples;
1547
1548                 /*
1549                  * rec->evlist->bkw_mmap_state may be BKW_MMAP_EMPTY here:
1550                  * when done == true and hits != rec->samples in the
1551                  * previous round.
1552                  *
1553                  * perf_evlist__toggle_bkw_mmap ensures we never convert
1554                  * BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1555                  */
1556                 if (trigger_is_hit(&switch_output_trigger) || done || draining)
1557                         perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
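                /*
                 * Descriptive note (not in the original source): switching to
                 * BKW_MMAP_DATA_PENDING marks the overwritable (backward)
                 * ring buffers as having data to drain, so the
                 * record__mmap_read_all() call below collects from them; the
                 * switch-output branch further down flips them back to
                 * BKW_MMAP_RUNNING once their data has been collected.
                 */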
1558
1559                 if (record__mmap_read_all(rec, false) < 0) {
1560                         trigger_error(&auxtrace_snapshot_trigger);
1561                         trigger_error(&switch_output_trigger);
1562                         err = -1;
1563                         goto out_child;
1564                 }
1565
1566                 if (auxtrace_record__snapshot_started) {
1567                         auxtrace_record__snapshot_started = 0;
1568                         if (!trigger_is_error(&auxtrace_snapshot_trigger))
1569                                 record__read_auxtrace_snapshot(rec, false);
1570                         if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1571                                 pr_err("AUX area tracing snapshot failed\n");
1572                                 err = -1;
1573                                 goto out_child;
1574                         }
1575                 }
1576
1577                 if (trigger_is_hit(&switch_output_trigger)) {
1578                         /*
1579                          * If switch_output_trigger is hit, the data in the
1580                          * overwritable ring buffer should have been collected,
1581                          * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1582                          *
1583                          * If SIGUSR2 is raised after or during record__mmap_read_all(),
1584                          * it didn't collect data from the overwritable ring
1585                          * buffer. Read again.
1586                          */
1587                         if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1588                                 continue;
1589                         trigger_ready(&switch_output_trigger);
1590
1591                         /*
1592                          * Re-enable events in the overwrite ring buffer after
1593                          * record__mmap_read_all(): we should have collected
1594                          * data from it.
1595                          */
1596                         perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1597
1598                         if (!quiet)
1599                                 fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1600                                         waking);
1601                         waking = 0;
1602                         fd = record__switch_output(rec, false);
1603                         if (fd < 0) {
1604                                 pr_err("Failed to switch to new file\n");
1605                                 trigger_error(&switch_output_trigger);
1606                                 err = fd;
1607                                 goto out_child;
1608                         }
1609
1610                         /* re-arm the alarm */
1611                         if (rec->switch_output.time)
1612                                 alarm(rec->switch_output.time);
1613                 }
1614
1615                 if (hits == rec->samples) {
1616                         if (done || draining)
1617                                 break;
1618                         err = perf_evlist__poll(rec->evlist, -1);
1619                         /*
1620                          * Propagate the error only if there is one. Ignore a
1621                          * positive number of returned events and EINTR.
1622                          */
1623                         if (err > 0 || (err < 0 && errno == EINTR))
1624                                 err = 0;
1625                         waking++;
1626
1627                         if (perf_evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1628                                 draining = true;
1629                 }
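                /*
                 * Descriptive note (not in the original source): hits ==
                 * rec->samples means no new samples arrived in this round, so
                 * we block in perf_evlist__poll() until the kernel wakes us
                 * up; once every pollable fd has reported POLLERR/POLLHUP the
                 * session is "draining" and the loop exits on the next pass.
                 */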
1630
1631                 /*
1632                  * When perf is starting the traced process, the events die
1633                  * with the process at the end and we wait for that, so there
1634                  * is no need to disable the events in this case.
1635                  */
1636                 if (done && !disabled && !target__none(&opts->target)) {
1637                         trigger_off(&auxtrace_snapshot_trigger);
1638                         evlist__disable(rec->evlist);
1639                         disabled = true;
1640                 }
1641         }
1642
1643         trigger_off(&auxtrace_snapshot_trigger);
1644         trigger_off(&switch_output_trigger);
1645
1646         if (opts->auxtrace_snapshot_on_exit)
1647                 record__auxtrace_snapshot_exit(rec);
1648
1649         if (forks && workload_exec_errno) {
1650                 char msg[STRERR_BUFSIZE];
1651                 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1652                 pr_err("Workload failed: %s\n", emsg);
1653                 err = -1;
1654                 goto out_child;
1655         }
1656
1657         if (!quiet)
1658                 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
1659
1660         if (target__none(&rec->opts.target))
1661                 record__synthesize_workload(rec, true);
1662
1663 out_child:
1664         record__mmap_read_all(rec, true);
1665         record__aio_mmap_read_sync(rec);
1666
1667         if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
1668                 ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
1669                 session->header.env.comp_ratio = ratio + 0.5;
1670         }
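        /*
         * Descriptive note (not in the original source): comp_ratio appears
         * to be an integer field in the header env, so the "+ 0.5" above
         * rounds the transferred/compressed ratio to the nearest whole
         * number before it is written out.
         */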
1671
1672         if (forks) {
1673                 int exit_status;
1674
1675                 if (!child_finished)
1676                         kill(rec->evlist->workload.pid, SIGTERM);
1677
1678                 wait(&exit_status);
1679
1680                 if (err < 0)
1681                         status = err;
1682                 else if (WIFEXITED(exit_status))
1683                         status = WEXITSTATUS(exit_status);
1684                 else if (WIFSIGNALED(exit_status))
1685                         signr = WTERMSIG(exit_status);
1686         } else
1687                 status = err;
1688
1689         record__synthesize(rec, true);
1690         /* this will be recalculated during process_buildids() */
1691         rec->samples = 0;
1692
1693         if (!err) {
1694                 if (!rec->timestamp_filename) {
1695                         record__finish_output(rec);
1696                 } else {
1697                         fd = record__switch_output(rec, true);
1698                         if (fd < 0) {
1699                                 status = fd;
1700                                 goto out_delete_session;
1701                         }
1702                 }
1703         }
1704
1705         perf_hooks__invoke_record_end();
1706
1707         if (!err && !quiet) {
1708                 char samples[128];
1709                 const char *postfix = rec->timestamp_filename ?
1710                                         ".<timestamp>" : "";
1711
1712                 if (rec->samples && !rec->opts.full_auxtrace)
1713                         scnprintf(samples, sizeof(samples),
1714                                   " (%" PRIu64 " samples)", rec->samples);
1715                 else
1716                         samples[0] = '\0';
1717
1718                 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
1719                         perf_data__size(data) / 1024.0 / 1024.0,
1720                         data->path, postfix, samples);
1721                 if (ratio) {
1722                         fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
1723                                         rec->session->bytes_transferred / 1024.0 / 1024.0,
1724                                         ratio);
1725                 }
1726                 fprintf(stderr, " ]\n");
1727         }
1728
1729 out_delete_session:
1730         zstd_fini(&session->zstd_data);
1731         perf_session__delete(session);
1732
1733         if (!opts->no_bpf_event)
1734                 perf_evlist__stop_sb_thread(sb_evlist);
1735         return status;
1736 }
1737
1738 static void callchain_debug(struct callchain_param *callchain)
1739 {
1740         static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
1741
1742         pr_debug("callchain: type %s\n", str[callchain->record_mode]);
1743
1744         if (callchain->record_mode == CALLCHAIN_DWARF)
1745                 pr_debug("callchain: stack dump size %d\n",
1746                          callchain->dump_size);
1747 }
1748
1749 int record_opts__parse_callchain(struct record_opts *record,
1750                                  struct callchain_param *callchain,
1751                                  const char *arg, bool unset)
1752 {
1753         int ret;
1754         callchain->enabled = !unset;
1755
1756         /* --no-call-graph */
1757         if (unset) {
1758                 callchain->record_mode = CALLCHAIN_NONE;
1759                 pr_debug("callchain: disabled\n");
1760                 return 0;
1761         }
1762
1763         ret = parse_callchain_record_opt(arg, callchain);
1764         if (!ret) {
1765                 /* Enable data address sampling for DWARF unwind. */
1766                 if (callchain->record_mode == CALLCHAIN_DWARF)
1767                         record->sample_address = true;
1768                 callchain_debug(callchain);
1769         }
1770
1771         return ret;
1772 }
1773
1774 int record_parse_callchain_opt(const struct option *opt,
1775                                const char *arg,
1776                                int unset)
1777 {
1778         return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
1779 }
1780
1781 int record_callchain_opt(const struct option *opt,
1782                          const char *arg __maybe_unused,
1783                          int unset __maybe_unused)
1784 {
1785         struct callchain_param *callchain = opt->value;
1786
1787         callchain->enabled = true;
1788
1789         if (callchain->record_mode == CALLCHAIN_NONE)
1790                 callchain->record_mode = CALLCHAIN_FP;
1791
1792         callchain_debug(callchain);
1793         return 0;
1794 }
1795
1796 static int perf_record_config(const char *var, const char *value, void *cb)
1797 {
1798         struct record *rec = cb;
1799
1800         if (!strcmp(var, "record.build-id")) {
1801                 if (!strcmp(value, "cache"))
1802                         rec->no_buildid_cache = false;
1803                 else if (!strcmp(value, "no-cache"))
1804                         rec->no_buildid_cache = true;
1805                 else if (!strcmp(value, "skip"))
1806                         rec->no_buildid = true;
1807                 else
1808                         return -1;
1809                 return 0;
1810         }
1811         if (!strcmp(var, "record.call-graph")) {
1812                 var = "call-graph.record-mode";
1813                 return perf_default_config(var, value, cb);
1814         }
1815 #ifdef HAVE_AIO_SUPPORT
1816         if (!strcmp(var, "record.aio")) {
1817                 rec->opts.nr_cblocks = strtol(value, NULL, 0);
1818                 if (!rec->opts.nr_cblocks)
1819                         rec->opts.nr_cblocks = nr_cblocks_default;
1820         }
1821 #endif
1822
1823         return 0;
1824 }
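/*
 * A minimal sketch (not in the original source) of the config keys handled
 * by perf_record_config() above, as they might appear in a user's
 * ~/.perfconfig; the values are only illustrative:
 *
 *   [record]
 *           build-id = cache        # or "no-cache" / "skip"
 *           call-graph = dwarf      # forwarded as call-graph.record-mode
 *           aio = 4                 # only honoured with HAVE_AIO_SUPPORT
 */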
1825
1826 struct clockid_map {
1827         const char *name;
1828         int clockid;
1829 };
1830
1831 #define CLOCKID_MAP(n, c)       \
1832         { .name = n, .clockid = (c), }
1833
1834 #define CLOCKID_END     { .name = NULL, }
1835
1836
1837 /*
1838  * Add the missing ones; we need to build on many distros...
1839  */
1840 #ifndef CLOCK_MONOTONIC_RAW
1841 #define CLOCK_MONOTONIC_RAW 4
1842 #endif
1843 #ifndef CLOCK_BOOTTIME
1844 #define CLOCK_BOOTTIME 7
1845 #endif
1846 #ifndef CLOCK_TAI
1847 #define CLOCK_TAI 11
1848 #endif
1849
1850 static const struct clockid_map clockids[] = {
1851         /* available for all events, NMI safe */
1852         CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
1853         CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),
1854
1855         /* available for some events */
1856         CLOCKID_MAP("realtime", CLOCK_REALTIME),
1857         CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
1858         CLOCKID_MAP("tai", CLOCK_TAI),
1859
1860         /* available for the lazy */
1861         CLOCKID_MAP("mono", CLOCK_MONOTONIC),
1862         CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
1863         CLOCKID_MAP("real", CLOCK_REALTIME),
1864         CLOCKID_MAP("boot", CLOCK_BOOTTIME),
1865
1866         CLOCKID_END,
1867 };
1868
1869 static int get_clockid_res(clockid_t clk_id, u64 *res_ns)
1870 {
1871         struct timespec res;
1872
1873         *res_ns = 0;
1874         if (!clock_getres(clk_id, &res))
1875                 *res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC;
1876         else
1877                 pr_warning("WARNING: Failed to determine specified clock resolution.\n");
1878
1879         return 0;
1880 }
1881
1882 static int parse_clockid(const struct option *opt, const char *str, int unset)
1883 {
1884         struct record_opts *opts = (struct record_opts *)opt->value;
1885         const struct clockid_map *cm;
1886         const char *ostr = str;
1887
1888         if (unset) {
1889                 opts->use_clockid = 0;
1890                 return 0;
1891         }
1892
1893         /* no arg passed */
1894         if (!str)
1895                 return 0;
1896
1897         /* no setting it twice */
1898         if (opts->use_clockid)
1899                 return -1;
1900
1901         opts->use_clockid = true;
1902
1903         /* if it's a number, we're done */
1904         if (sscanf(str, "%d", &opts->clockid) == 1)
1905                 return get_clockid_res(opts->clockid, &opts->clockid_res_ns);
1906
1907         /* allow a "CLOCK_" prefix to the name */
1908         if (!strncasecmp(str, "CLOCK_", 6))
1909                 str += 6;
1910
1911         for (cm = clockids; cm->name; cm++) {
1912                 if (!strcasecmp(str, cm->name)) {
1913                         opts->clockid = cm->clockid;
1914                         return get_clockid_res(opts->clockid,
1915                                                &opts->clockid_res_ns);
1916                 }
1917         }
1918
1919         opts->use_clockid = false;
1920         ui__warning("unknown clockid %s, check man page\n", ostr);
1921         return -1;
1922 }
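/*
 * Illustrative -k/--clockid invocations accepted by parse_clockid() above
 * (not from the original source):
 *
 *   perf record -k monotonic_raw ...   # name from the clockids[] table
 *   perf record -k CLOCK_BOOTTIME ...  # "CLOCK_" prefix is stripped
 *   perf record -k 4 ...               # a raw clockid number is used as-is
 */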
1923
1924 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
1925 {
1926         struct record_opts *opts = (struct record_opts *)opt->value;
1927
1928         if (unset || !str)
1929                 return 0;
1930
1931         if (!strcasecmp(str, "node"))
1932                 opts->affinity = PERF_AFFINITY_NODE;
1933         else if (!strcasecmp(str, "cpu"))
1934                 opts->affinity = PERF_AFFINITY_CPU;
1935
1936         return 0;
1937 }
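/*
 * Descriptive note (not in the original source): when --affinity is not
 * given, cmd_record() below defaults opts->affinity to PERF_AFFINITY_SYS;
 * "--affinity=node" or "--affinity=cpu" make the trace reading thread
 * follow the NUMA node or the CPU of the mmap buffer being processed.
 */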
1938
1939 static int record__parse_mmap_pages(const struct option *opt,
1940                                     const char *str,
1941                                     int unset __maybe_unused)
1942 {
1943         struct record_opts *opts = opt->value;
1944         char *s, *p;
1945         unsigned int mmap_pages;
1946         int ret;
1947
1948         if (!str)
1949                 return -EINVAL;
1950
1951         s = strdup(str);
1952         if (!s)
1953                 return -ENOMEM;
1954
1955         p = strchr(s, ',');
1956         if (p)
1957                 *p = '\0';
1958
1959         if (*s) {
1960                 ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
1961                 if (ret)
1962                         goto out_free;
1963                 opts->mmap_pages = mmap_pages;
1964         }
1965
1966         if (!p) {
1967                 ret = 0;
1968                 goto out_free;
1969         }
1970
1971         ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
1972         if (ret)
1973                 goto out_free;
1974
1975         opts->auxtrace_mmap_pages = mmap_pages;
1976
1977 out_free:
1978         free(s);
1979         return ret;
1980 }
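/*
 * Illustrative -m/--mmap-pages usage for the parser above (not from the
 * original source): "-m 512" sets only opts->mmap_pages, while
 * "-m 512,128" additionally sets opts->auxtrace_mmap_pages from the part
 * after the comma.
 */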
1981
1982 static void switch_output_size_warn(struct record *rec)
1983 {
1984         u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
1985         struct switch_output *s = &rec->switch_output;
1986
1987         wakeup_size /= 2;
1988
1989         if (s->size < wakeup_size) {
1990                 char buf[100];
1991
1992                 unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
1993                 pr_warning("WARNING: switch-output data size is lower than "
1994                            "the wakeup kernel buffer size (%s); "
1995                            "expect bigger perf.data sizes\n", buf);
1996         }
1997 }
1998
1999 static int switch_output_setup(struct record *rec)
2000 {
2001         struct switch_output *s = &rec->switch_output;
2002         static struct parse_tag tags_size[] = {
2003                 { .tag  = 'B', .mult = 1       },
2004                 { .tag  = 'K', .mult = 1 << 10 },
2005                 { .tag  = 'M', .mult = 1 << 20 },
2006                 { .tag  = 'G', .mult = 1 << 30 },
2007                 { .tag  = 0 },
2008         };
2009         static struct parse_tag tags_time[] = {
2010                 { .tag  = 's', .mult = 1        },
2011                 { .tag  = 'm', .mult = 60       },
2012                 { .tag  = 'h', .mult = 60*60    },
2013                 { .tag  = 'd', .mult = 60*60*24 },
2014                 { .tag  = 0 },
2015         };
2016         unsigned long val;
2017
2018         if (!s->set)
2019                 return 0;
2020
2021         if (!strcmp(s->str, "signal")) {
2022                 s->signal = true;
2023                 pr_debug("switch-output with SIGUSR2 signal\n");
2024                 goto enabled;
2025         }
2026
2027         val = parse_tag_value(s->str, tags_size);
2028         if (val != (unsigned long) -1) {
2029                 s->size = val;
2030                 pr_debug("switch-output with %s size threshold\n", s->str);
2031                 goto enabled;
2032         }
2033
2034         val = parse_tag_value(s->str, tags_time);
2035         if (val != (unsigned long) -1) {
2036                 s->time = val;
2037                 pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2038                          s->str, s->time);
2039                 goto enabled;
2040         }
2041
2042         return -1;
2043
2044 enabled:
2045         rec->timestamp_filename = true;
2046         s->enabled              = true;
2047
2048         if (s->size && !rec->opts.no_buffering)
2049                 switch_output_size_warn(rec);
2050
2051         return 0;
2052 }
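/*
 * Illustrative --switch-output values matched by switch_output_setup()
 * above (not from the original source):
 *
 *   --switch-output            # defaults to "signal": rotate on SIGUSR2
 *   --switch-output=200M       # rotate once 200 MB of data are written
 *   --switch-output=30s        # rotate every 30 seconds via alarm()
 */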
2053
2054 static const char * const __record_usage[] = {
2055         "perf record [<options>] [<command>]",
2056         "perf record [<options>] -- <command> [<options>]",
2057         NULL
2058 };
2059 const char * const *record_usage = __record_usage;
2060
2061 /*
2062  * XXX Ideally this would be local to cmd_record() and passed to a record__new,
2063  * because we need access to it in record__exit, which is called after
2064  * cmd_record() exits, but since record_options needs to be accessible to
2065  * builtin-script, leave it here.
2066  *
2067  * At least we don't touch it in all the other functions here directly.
2068  *
2069  * Just say no to tons of global variables, sigh.
2070  */
2071 static struct record record = {
2072         .opts = {
2073                 .sample_time         = true,
2074                 .mmap_pages          = UINT_MAX,
2075                 .user_freq           = UINT_MAX,
2076                 .user_interval       = ULLONG_MAX,
2077                 .freq                = 4000,
2078                 .target              = {
2079                         .uses_mmap   = true,
2080                         .default_per_cpu = true,
2081                 },
2082                 .mmap_flush          = MMAP_FLUSH_DEFAULT,
2083         },
2084         .tool = {
2085                 .sample         = process_sample_event,
2086                 .fork           = perf_event__process_fork,
2087                 .exit           = perf_event__process_exit,
2088                 .comm           = perf_event__process_comm,
2089                 .namespaces     = perf_event__process_namespaces,
2090                 .mmap           = perf_event__process_mmap,
2091                 .mmap2          = perf_event__process_mmap2,
2092                 .ordered_events = true,
2093         },
2094 };
2095
2096 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2097         "\n\t\t\t\tDefault: fp";
2098
2099 static bool dry_run;
2100
2101 /*
2102  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
2103  * with it and switch to using the library functions in perf_evlist that came
2104  * from builtin-record.c, i.e. use record_opts,
2105  * perf_evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record'
2106  * using pipes, etc.
2107  */
2108 static struct option __record_options[] = {
2109         OPT_CALLBACK('e', "event", &record.evlist, "event",
2110                      "event selector. use 'perf list' to list available events",
2111                      parse_events_option),
2112         OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2113                      "event filter", parse_filter),
2114         OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2115                            NULL, "don't record events from perf itself",
2116                            exclude_perf),
2117         OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2118                     "record events on existing process id"),
2119         OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2120                     "record events on existing thread id"),
2121         OPT_INTEGER('r', "realtime", &record.realtime_prio,
2122                     "collect data with this RT SCHED_FIFO priority"),
2123         OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2124                     "collect data without buffering"),
2125         OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2126                     "collect raw sample records from all opened counters"),
2127         OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2128                             "system-wide collection from all CPUs"),
2129         OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2130                     "list of cpus to monitor"),
2131         OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2132         OPT_STRING('o', "output", &record.data.path, "file",
2133                     "output file name"),
2134         OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2135                         &record.opts.no_inherit_set,
2136                         "child tasks do not inherit counters"),
2137         OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2138                     "synthesize non-sample events at the end of output"),
2139         OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2140         OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
2141         OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2142                     "Fail if the specified frequency can't be used"),
2143         OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2144                      "profile at this frequency",
2145                       record__parse_freq),
2146         OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2147                      "number of mmap data pages and AUX area tracing mmap pages",
2148                      record__parse_mmap_pages),
2149         OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2150                      "Minimum number of bytes that are extracted from mmap data pages (default: 1)",
2151                      record__mmap_flush_parse),
2152         OPT_BOOLEAN(0, "group", &record.opts.group,
2153                     "put the counters into a counter group"),
2154         OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2155                            NULL, "enables call-graph recording" ,
2156                            &record_callchain_opt),
2157         OPT_CALLBACK(0, "call-graph", &record.opts,
2158                      "record_mode[,record_size]", record_callchain_help,
2159                      &record_parse_callchain_opt),
2160         OPT_INCR('v', "verbose", &verbose,
2161                     "be more verbose (show counter open errors, etc)"),
2162         OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2163         OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2164                     "per thread counts"),
2165         OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2166         OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2167                     "Record the sample physical addresses"),
2168         OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2169         OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2170                         &record.opts.sample_time_set,
2171                         "Record the sample timestamps"),
2172         OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2173                         "Record the sample period"),
2174         OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2175                     "don't sample"),
2176         OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2177                         &record.no_buildid_cache_set,
2178                         "do not update the buildid cache"),
2179         OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2180                         &record.no_buildid_set,
2181                         "do not collect buildids in perf.data"),
2182         OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2183                      "monitor event in cgroup name only",
2184                      parse_cgroups),
2185         OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
2186                   "ms to wait before starting measurement after program start"),
2187         OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2188                    "user to profile"),
2189
2190         OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2191                      "branch any", "sample any taken branches",
2192                      parse_branch_stack),
2193
2194         OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2195                      "branch filter mask", "branch stack filter modes",
2196                      parse_branch_stack),
2197         OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2198                     "sample by weight (on special events only)"),
2199         OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2200                     "sample transaction flags (special events only)"),
2201         OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2202                     "use per-thread mmaps"),
2203         OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2204                     "sample selected machine registers on interrupt,"
2205                     " use '-I?' to list register names", parse_intr_regs),
2206         OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2207                     "sample selected machine registers in user space,"
2208                     " use '--user-regs=?' to list register names", parse_user_regs),
2209         OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2210                     "Record running/enabled time of read (:S) events"),
2211         OPT_CALLBACK('k', "clockid", &record.opts,
2212         "clockid", "clockid to use for events, see clock_gettime()",
2213         parse_clockid),
2214         OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2215                           "opts", "AUX area tracing Snapshot Mode", ""),
2216         OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2217                         "per thread proc mmap processing timeout in ms"),
2218         OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2219                     "Record namespaces events"),
2220         OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
2221                     "Record context switch events"),
2222         OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2223                          "Configure all used events to run in kernel space.",
2224                          PARSE_OPT_EXCLUSIVE),
2225         OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2226                          "Configure all used events to run in user space.",
2227                          PARSE_OPT_EXCLUSIVE),
2228         OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2229                     "collect kernel callchains"),
2230         OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2231                     "collect user callchains"),
2232         OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2233                    "clang binary to use for compiling BPF scriptlets"),
2234         OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2235                    "options passed to clang when compiling BPF scriptlets"),
2236         OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2237                    "file", "vmlinux pathname"),
2238         OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2239                     "Record build-id of all DSOs regardless of hits"),
2240         OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2241                     "append timestamp to output filename"),
2242         OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2243                     "Record timestamp boundary (time of first/last samples)"),
2244         OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2245                           &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2246                           "Switch output when receiving SIGUSR2 (signal) or when crossing a size or time threshold",
2247                           "signal"),
2248         OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2249                    "Limit number of switch output generated files"),
2250         OPT_BOOLEAN(0, "dry-run", &dry_run,
2251                     "Parse options then exit"),
2252 #ifdef HAVE_AIO_SUPPORT
2253         OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2254                      &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2255                      record__aio_parse),
2256 #endif
2257         OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2258                      "Set the affinity mask of the trace reading thread to the NUMA node CPU mask or the CPU of the processed mmap buffer",
2259                      record__parse_affinity),
2260 #ifdef HAVE_ZSTD_SUPPORT
2261         OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2262                             "n", "Compress records using the specified level (default: 1 - fastest compression, 22 - greatest compression)",
2263                             record__parse_comp_level),
2264 #endif
2265         OPT_END()
2266 };
2267
2268 struct option *record_options = __record_options;
2269
2270 int cmd_record(int argc, const char **argv)
2271 {
2272         int err;
2273         struct record *rec = &record;
2274         char errbuf[BUFSIZ];
2275
2276         setlocale(LC_ALL, "");
2277
2278 #ifndef HAVE_LIBBPF_SUPPORT
2279 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2280         set_nobuild('\0', "clang-path", true);
2281         set_nobuild('\0', "clang-opt", true);
2282 # undef set_nobuild
2283 #endif
2284
2285 #ifndef HAVE_BPF_PROLOGUE
2286 # if !defined (HAVE_DWARF_SUPPORT)
2287 #  define REASON  "NO_DWARF=1"
2288 # elif !defined (HAVE_LIBBPF_SUPPORT)
2289 #  define REASON  "NO_LIBBPF=1"
2290 # else
2291 #  define REASON  "this architecture doesn't support BPF prologue"
2292 # endif
2293 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2294         set_nobuild('\0', "vmlinux", true);
2295 # undef set_nobuild
2296 # undef REASON
2297 #endif
2298
2299         CPU_ZERO(&rec->affinity_mask);
2300         rec->opts.affinity = PERF_AFFINITY_SYS;
2301
2302         rec->evlist = evlist__new();
2303         if (rec->evlist == NULL)
2304                 return -ENOMEM;
2305
2306         err = perf_config(perf_record_config, rec);
2307         if (err)
2308                 return err;
2309
2310         argc = parse_options(argc, argv, record_options, record_usage,
2311                             PARSE_OPT_STOP_AT_NON_OPTION);
2312         if (quiet)
2313                 perf_quiet_option();
2314
2315         /* Make system wide (-a) the default target. */
2316         if (!argc && target__none(&rec->opts.target))
2317                 rec->opts.target.system_wide = true;
2318
2319         if (nr_cgroups && !rec->opts.target.system_wide) {
2320                 usage_with_options_msg(record_usage, record_options,
2321                         "cgroup monitoring only available in system-wide mode");
2322
2323         }
2324
2325         if (rec->opts.comp_level != 0) {
2326                 pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2327                 rec->no_buildid = true;
2328         }
2329
2330         if (rec->opts.record_switch_events &&
2331             !perf_can_record_switch_events()) {
2332                 ui__error("kernel does not support recording context switch events\n");
2333                 parse_options_usage(record_usage, record_options, "switch-events", 0);
2334                 return -EINVAL;
2335         }
2336
2337         if (switch_output_setup(rec)) {
2338                 parse_options_usage(record_usage, record_options, "switch-output", 0);
2339                 return -EINVAL;
2340         }
2341
2342         if (rec->switch_output.time) {
2343                 signal(SIGALRM, alarm_sig_handler);
2344                 alarm(rec->switch_output.time);
2345         }
2346
2347         if (rec->switch_output.num_files) {
2348                 rec->switch_output.filenames = calloc(rec->switch_output.num_files,
2349                                                       sizeof(char *));
2350                 if (!rec->switch_output.filenames)
2351                         return -EINVAL;
2352         }
2353
2354         /*
2355          * Allow aliases to facilitate the lookup of symbols for address
2356          * filters. Refer to auxtrace_parse_filters().
2357          */
2358         symbol_conf.allow_aliases = true;
2359
2360         symbol__init(NULL);
2361
2362         err = record__auxtrace_init(rec);
2363         if (err)
2364                 goto out;
2365
2366         if (dry_run)
2367                 goto out;
2368
2369         err = bpf__setup_stdout(rec->evlist);
2370         if (err) {
2371                 bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2372                 pr_err("ERROR: Setup BPF stdout failed: %s\n",
2373                          errbuf);
2374                 goto out;
2375         }
2376
2377         err = -ENOMEM;
2378
2379         if (rec->no_buildid_cache || rec->no_buildid) {
2380                 disable_buildid_cache();
2381         } else if (rec->switch_output.enabled) {
2382                 /*
2383                  * In 'perf record --switch-output', disable buildid
2384                  * generation by default to reduce data file switching
2385                  * overhead. Still generate buildids if they are explicitly
2386                  * required using
2387                  *
2388                  *  perf record --switch-output --no-no-buildid \
2389                  *              --no-no-buildid-cache
2390                  *
2391                  * Following code equals to:
2392                  *
2393                  * if ((rec->no_buildid || !rec->no_buildid_set) &&
2394                  *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2395                  *         disable_buildid_cache();
2396                  */
2397                 bool disable = true;
2398
2399                 if (rec->no_buildid_set && !rec->no_buildid)
2400                         disable = false;
2401                 if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2402                         disable = false;
2403                 if (disable) {
2404                         rec->no_buildid = true;
2405                         rec->no_buildid_cache = true;
2406                         disable_buildid_cache();
2407                 }
2408         }
2409
2410         if (record.opts.overwrite)
2411                 record.opts.tail_synthesize = true;
2412
2413         if (rec->evlist->core.nr_entries == 0 &&
2414             __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
2415                 pr_err("Not enough memory for event selector list\n");
2416                 goto out;
2417         }
2418
2419         if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2420                 rec->opts.no_inherit = true;
2421
2422         err = target__validate(&rec->opts.target);
2423         if (err) {
2424                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2425                 ui__warning("%s\n", errbuf);
2426         }
2427
2428         err = target__parse_uid(&rec->opts.target);
2429         if (err) {
2430                 int saved_errno = errno;
2431
2432                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2433                 ui__error("%s", errbuf);
2434
2435                 err = -saved_errno;
2436                 goto out;
2437         }
2438
2439         /* Enable ignoring missing threads when the -u or -p option is given. */
2440         rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2441
2442         err = -ENOMEM;
2443         if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2444                 usage_with_options(record_usage, record_options);
2445
2446         err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2447         if (err)
2448                 goto out;
2449
2450         /*
2451          * We take all buildids when the file contains
2452          * AUX area tracing data, because we do not decode the
2453          * trace, as decoding would take too long.
2454          */
2455         if (rec->opts.full_auxtrace)
2456                 rec->buildid_all = true;
2457
2458         if (record_opts__config(&rec->opts)) {
2459                 err = -EINVAL;
2460                 goto out;
2461         }
2462
2463         if (rec->opts.nr_cblocks > nr_cblocks_max)
2464                 rec->opts.nr_cblocks = nr_cblocks_max;
2465         pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2466
2467         pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2468         pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2469
2470         if (rec->opts.comp_level > comp_level_max)
2471                 rec->opts.comp_level = comp_level_max;
2472         pr_debug("comp level: %d\n", rec->opts.comp_level);
2473
2474         err = __cmd_record(&record, argc, argv);
2475 out:
2476         evlist__delete(rec->evlist);
2477         symbol__exit();
2478         auxtrace_record__free(rec->itr);
2479         return err;
2480 }
2481
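/*
 * Descriptive note (not in the original source): SIGUSR2 is routed to the
 * handler below and serves two purposes: it hits the auxtrace snapshot
 * trigger when AUX area snapshot mode is armed, and it hits the
 * switch-output trigger when --switch-output=signal was requested, so one
 * signal can drive both mechanisms.
 */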
2482 static void snapshot_sig_handler(int sig __maybe_unused)
2483 {
2484         struct record *rec = &record;
2485
2486         if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2487                 trigger_hit(&auxtrace_snapshot_trigger);
2488                 auxtrace_record__snapshot_started = 1;
2489                 if (auxtrace_record__snapshot_start(record.itr))
2490                         trigger_error(&auxtrace_snapshot_trigger);
2491         }
2492
2493         if (switch_output_signal(rec))
2494                 trigger_hit(&switch_output_trigger);
2495 }
2496
2497 static void alarm_sig_handler(int sig __maybe_unused)
2498 {
2499         struct record *rec = &record;
2500
2501         if (switch_output_time(rec))
2502                 trigger_hit(&switch_output_trigger);
2503 }