tools/perf/builtin-record.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10
11 #include "perf.h"
12
13 #include "util/build-id.h"
14 #include <subcmd/parse-options.h>
15 #include "util/parse-events.h"
16 #include "util/config.h"
17
18 #include "util/callchain.h"
19 #include "util/cgroup.h"
20 #include "util/header.h"
21 #include "util/event.h"
22 #include "util/evlist.h"
23 #include "util/evsel.h"
24 #include "util/debug.h"
25 #include "util/session.h"
26 #include "util/tool.h"
27 #include "util/symbol.h"
28 #include "util/cpumap.h"
29 #include "util/thread_map.h"
30 #include "util/data.h"
31 #include "util/perf_regs.h"
32 #include "util/auxtrace.h"
33 #include "util/tsc.h"
34 #include "util/parse-branch-options.h"
35 #include "util/parse-regs-options.h"
36 #include "util/llvm-utils.h"
37 #include "util/bpf-loader.h"
38 #include "util/trigger.h"
39 #include "util/perf-hooks.h"
40 #include "util/cpu-set-sched.h"
41 #include "util/time-utils.h"
42 #include "util/units.h"
43 #include "util/bpf-event.h"
44 #include "asm/bug.h"
45
46 #include <errno.h>
47 #include <inttypes.h>
48 #include <locale.h>
49 #include <poll.h>
50 #include <unistd.h>
51 #include <sched.h>
52 #include <signal.h>
53 #include <sys/mman.h>
54 #include <sys/wait.h>
55 #include <linux/time64.h>
56 #include <linux/zalloc.h>
57
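/*
 * State for on-the-fly rotation of the output ("switch output"): a new
 * perf.data file can be started on SIGUSR2 (signal), once a given amount of
 * data has been written (size), or periodically (time).  filenames, num_files
 * and cur_file keep a small ring of the most recent dumps so older ones can
 * be removed as new files are produced.
 */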
58 struct switch_output {
59         bool             enabled;
60         bool             signal;
61         unsigned long    size;
62         unsigned long    time;
63         const char      *str;
64         bool             set;
65         char             **filenames;
66         int              num_files;
67         int              cur_file;
68 };
69
70 struct record {
71         struct perf_tool        tool;
72         struct record_opts      opts;
73         u64                     bytes_written;
74         struct perf_data        data;
75         struct auxtrace_record  *itr;
76         struct perf_evlist      *evlist;
77         struct perf_session     *session;
78         int                     realtime_prio;
79         bool                    no_buildid;
80         bool                    no_buildid_set;
81         bool                    no_buildid_cache;
82         bool                    no_buildid_cache_set;
83         bool                    buildid_all;
84         bool                    timestamp_filename;
85         bool                    timestamp_boundary;
86         struct switch_output    switch_output;
87         unsigned long long      samples;
88         cpu_set_t               affinity_mask;
89 };
90
91 static volatile int auxtrace_record__snapshot_started;
92 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
93 static DEFINE_TRIGGER(switch_output_trigger);
94
95 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
96         "SYS", "NODE", "CPU"
97 };
98
99 static bool switch_output_signal(struct record *rec)
100 {
101         return rec->switch_output.signal &&
102                trigger_is_ready(&switch_output_trigger);
103 }
104
105 static bool switch_output_size(struct record *rec)
106 {
107         return rec->switch_output.size &&
108                trigger_is_ready(&switch_output_trigger) &&
109                (rec->bytes_written >= rec->switch_output.size);
110 }
111
112 static bool switch_output_time(struct record *rec)
113 {
114         return rec->switch_output.time &&
115                trigger_is_ready(&switch_output_trigger);
116 }
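
/*
 * Size-based rotation is checked from record__write() and record__aio_push()
 * once bytes_written crosses switch_output.size.  Time-based rotation is
 * presumably re-armed from alarm_sig_handler() (declared further down), which
 * periodically makes the switch_output trigger fire.
 */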
117
118 static int record__write(struct record *rec, struct perf_mmap *map __maybe_unused,
119                          void *bf, size_t size)
120 {
121         struct perf_data_file *file = &rec->session->data->file;
122
123         if (perf_data_file__write(file, bf, size) < 0) {
124                 pr_err("failed to write perf data, error: %m\n");
125                 return -1;
126         }
127
128         rec->bytes_written += size;
129
130         if (switch_output_size(rec))
131                 trigger_hit(&switch_output_trigger);
132
133         return 0;
134 }
135
136 static int record__aio_enabled(struct record *rec);
137 static int record__comp_enabled(struct record *rec);
138 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
139                             void *src, size_t src_size);
140
141 #ifdef HAVE_AIO_SUPPORT
142 static int record__aio_write(struct aiocb *cblock, int trace_fd,
143                 void *buf, size_t size, off_t off)
144 {
145         int rc;
146
147         cblock->aio_fildes = trace_fd;
148         cblock->aio_buf    = buf;
149         cblock->aio_nbytes = size;
150         cblock->aio_offset = off;
151         cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
152
153         do {
154                 rc = aio_write(cblock);
155                 if (rc == 0) {
156                         break;
157                 } else if (errno != EAGAIN) {
158                         cblock->aio_fildes = -1;
159                         pr_err("failed to queue perf data, error: %m\n");
160                         break;
161                 }
162         } while (1);
163
164         return rc;
165 }
166
167 static int record__aio_complete(struct perf_mmap *md, struct aiocb *cblock)
168 {
169         void *rem_buf;
170         off_t rem_off;
171         size_t rem_size;
172         int rc, aio_errno;
173         ssize_t aio_ret, written;
174
175         aio_errno = aio_error(cblock);
176         if (aio_errno == EINPROGRESS)
177                 return 0;
178
179         written = aio_ret = aio_return(cblock);
180         if (aio_ret < 0) {
181                 if (aio_errno != EINTR)
182                         pr_err("failed to write perf data, error: %m\n");
183                 written = 0;
184         }
185
186         rem_size = cblock->aio_nbytes - written;
187
188         if (rem_size == 0) {
189                 cblock->aio_fildes = -1;
190                 /*
191                  * md->refcount is incremented in record__aio_pushfn() for
192                  * every aio write request started in record__aio_push() so
193                  * decrement it because the request is now complete.
194                  */
195                 perf_mmap__put(md);
196                 rc = 1;
197         } else {
198                 /*
199                  * aio write request may require a restart with the
200                  * remainder if the kernel didn't write the whole
201                  * chunk at once.
202                  */
203                 rem_off = cblock->aio_offset + written;
204                 rem_buf = (void *)(cblock->aio_buf + written);
205                 record__aio_write(cblock, cblock->aio_fildes,
206                                 rem_buf, rem_size, rem_off);
207                 rc = 0;
208         }
209
210         return rc;
211 }
212
213 static int record__aio_sync(struct perf_mmap *md, bool sync_all)
214 {
215         struct aiocb **aiocb = md->aio.aiocb;
216         struct aiocb *cblocks = md->aio.cblocks;
217         struct timespec timeout = { 0, 1000 * 1000 * 1 }; /* 1ms */
218         int i, do_suspend;
219
220         do {
221                 do_suspend = 0;
222                 for (i = 0; i < md->aio.nr_cblocks; ++i) {
223                         if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
224                                 if (sync_all)
225                                         aiocb[i] = NULL;
226                                 else
227                                         return i;
228                         } else {
229                                 /*
230                  * The started aio write is not complete yet,
231                  * so it has to be waited for before the
232                  * next allocation.
233                                  */
234                                 aiocb[i] = &cblocks[i];
235                                 do_suspend = 1;
236                         }
237                 }
238                 if (!do_suspend)
239                         return -1;
240
241                 while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
242                         if (!(errno == EAGAIN || errno == EINTR))
243                                 pr_err("failed to sync perf data, error: %m\n");
244                 }
245         } while (1);
246 }
247
248 struct record_aio {
249         struct record   *rec;
250         void            *data;
251         size_t          size;
252 };
253
254 static int record__aio_pushfn(struct perf_mmap *map, void *to, void *buf, size_t size)
255 {
256         struct record_aio *aio = to;
257
258         /*
259          * The map->base data pointed to by buf is copied into a free map->aio.data[]
260          * buffer to release space in the kernel buffer as fast as possible, by
261          * calling perf_mmap__consume() from the perf_mmap__push() function.
262          *
263          * That lets the kernel proceed with storing more profiling data into
264          * the kernel buffer earlier than the other per-cpu kernel buffers are handled.
265          *
266          * Copying can be done in two steps in case the chunk of profiling data
267          * crosses the upper bound of the kernel buffer. In this case we move the
268          * data from map->start up to the upper bound first and then the remainder
269          * from the beginning of the kernel buffer up to the end of the data chunk.
270          */
271
272         if (record__comp_enabled(aio->rec)) {
273                 size = zstd_compress(aio->rec->session, aio->data + aio->size,
274                                      perf_mmap__mmap_len(map) - aio->size,
275                                      buf, size);
276         } else {
277                 memcpy(aio->data + aio->size, buf, size);
278         }
279
280         if (!aio->size) {
281                 /*
282                  * Increment map->refcount to guard the map->aio.data[] buffer
283                  * against premature deallocation, because the map object can be
284                  * released before the aio write request started on the
285                  * map->aio.data[] buffer has completed.
286                  *
287                  * perf_mmap__put() is done at record__aio_complete()
288                  * after started aio request completion or at record__aio_push()
289                  * if the request failed to start.
290                  */
291                 perf_mmap__get(map);
292         }
293
294         aio->size += size;
295
296         return size;
297 }
298
299 static int record__aio_push(struct record *rec, struct perf_mmap *map, off_t *off)
300 {
301         int ret, idx;
302         int trace_fd = rec->session->data->file.fd;
303         struct record_aio aio = { .rec = rec, .size = 0 };
304
305         /*
306          * Call record__aio_sync() to wait until a map->aio.data[] buffer
307          * becomes available after the previous aio write operation.
308          */
309
310         idx = record__aio_sync(map, false);
311         aio.data = map->aio.data[idx];
312         ret = perf_mmap__push(map, &aio, record__aio_pushfn);
313         if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
314                 return ret;
315
316         rec->samples++;
317         ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
318         if (!ret) {
319                 *off += aio.size;
320                 rec->bytes_written += aio.size;
321                 if (switch_output_size(rec))
322                         trigger_hit(&switch_output_trigger);
323         } else {
324                 /*
325                  * Decrement the map->refcount taken in record__aio_pushfn()
326                  * if the record__aio_write() operation failed to start; otherwise
327                  * map->refcount is decremented in record__aio_complete() after
328                  * the aio write operation finishes successfully.
329                  */
330                 perf_mmap__put(map);
331         }
332
333         return ret;
334 }
335
336 static off_t record__aio_get_pos(int trace_fd)
337 {
338         return lseek(trace_fd, 0, SEEK_CUR);
339 }
340
341 static void record__aio_set_pos(int trace_fd, off_t pos)
342 {
343         lseek(trace_fd, pos, SEEK_SET);
344 }
345
346 static void record__aio_mmap_read_sync(struct record *rec)
347 {
348         int i;
349         struct perf_evlist *evlist = rec->evlist;
350         struct perf_mmap *maps = evlist->mmap;
351
352         if (!record__aio_enabled(rec))
353                 return;
354
355         for (i = 0; i < evlist->nr_mmaps; i++) {
356                 struct perf_mmap *map = &maps[i];
357
358                 if (map->base)
359                         record__aio_sync(map, true);
360         }
361 }
362
363 static int nr_cblocks_default = 1;
364 static int nr_cblocks_max = 4;
365
366 static int record__aio_parse(const struct option *opt,
367                              const char *str,
368                              int unset)
369 {
370         struct record_opts *opts = (struct record_opts *)opt->value;
371
372         if (unset) {
373                 opts->nr_cblocks = 0;
374         } else {
375                 if (str)
376                         opts->nr_cblocks = strtol(str, NULL, 0);
377                 if (!opts->nr_cblocks)
378                         opts->nr_cblocks = nr_cblocks_default;
379         }
380
381         return 0;
382 }
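
/*
 * Illustrative usage, assuming this is wired up as the --aio record option:
 * a bare "--aio" requests nr_cblocks_default (1) asynchronous control block
 * per mmap and "--aio=4" requests four.  The parser itself does not range
 * check the value; it is expected to be clamped to nr_cblocks_max elsewhere.
 */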
383 #else /* HAVE_AIO_SUPPORT */
384 static int nr_cblocks_max = 0;
385
386 static int record__aio_push(struct record *rec __maybe_unused, struct perf_mmap *map __maybe_unused,
387                             off_t *off __maybe_unused)
388 {
389         return -1;
390 }
391
392 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
393 {
394         return -1;
395 }
396
397 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
398 {
399 }
400
401 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
402 {
403 }
404 #endif
405
406 static int record__aio_enabled(struct record *rec)
407 {
408         return rec->opts.nr_cblocks > 0;
409 }
410
411 #define MMAP_FLUSH_DEFAULT 1
412 static int record__mmap_flush_parse(const struct option *opt,
413                                     const char *str,
414                                     int unset)
415 {
416         int flush_max;
417         struct record_opts *opts = (struct record_opts *)opt->value;
418         static struct parse_tag tags[] = {
419                         { .tag  = 'B', .mult = 1       },
420                         { .tag  = 'K', .mult = 1 << 10 },
421                         { .tag  = 'M', .mult = 1 << 20 },
422                         { .tag  = 'G', .mult = 1 << 30 },
423                         { .tag  = 0 },
424         };
425
426         if (unset)
427                 return 0;
428
429         if (str) {
430                 opts->mmap_flush = parse_tag_value(str, tags);
431                 if (opts->mmap_flush == (int)-1)
432                         opts->mmap_flush = strtol(str, NULL, 0);
433         }
434
435         if (!opts->mmap_flush)
436                 opts->mmap_flush = MMAP_FLUSH_DEFAULT;
437
438         flush_max = perf_evlist__mmap_size(opts->mmap_pages);
439         flush_max /= 4;
440         if (opts->mmap_flush > flush_max)
441                 opts->mmap_flush = flush_max;
442
443         return 0;
444 }
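
/*
 * Illustrative parse, assuming the option is spelled --mmap-flush: "1K" goes
 * through parse_tag_value() and yields 1 << 10 = 1024 bytes, a bare number is
 * taken as bytes via strtol(), and an empty/zero value falls back to
 * MMAP_FLUSH_DEFAULT (1 byte, i.e. flush as soon as any data is available).
 * The result is capped at a quarter of the mmap buffer size.
 */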
445
446 #ifdef HAVE_ZSTD_SUPPORT
447 static unsigned int comp_level_default = 1;
448
449 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
450 {
451         struct record_opts *opts = opt->value;
452
453         if (unset) {
454                 opts->comp_level = 0;
455         } else {
456                 if (str)
457                         opts->comp_level = strtol(str, NULL, 0);
458                 if (!opts->comp_level)
459                         opts->comp_level = comp_level_default;
460         }
461
462         return 0;
463 }
464 #endif
465 static unsigned int comp_level_max = 22;
466
467 static int record__comp_enabled(struct record *rec)
468 {
469         return rec->opts.comp_level > 0;
470 }
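
/*
 * Compression is in effect whenever comp_level > 0: record__pushfn() and
 * record__aio_pushfn() then route mmap data through zstd_compress() instead
 * of copying it verbatim.  A bare -z/--compression-level (assuming the usual
 * perf record spelling; the option table is further down in this file) picks
 * comp_level_default (1), and comp_level_max (22) matches the highest zstd
 * level.
 */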
471
472 static int process_synthesized_event(struct perf_tool *tool,
473                                      union perf_event *event,
474                                      struct perf_sample *sample __maybe_unused,
475                                      struct machine *machine __maybe_unused)
476 {
477         struct record *rec = container_of(tool, struct record, tool);
478         return record__write(rec, NULL, event, event->header.size);
479 }
480
481 static int record__pushfn(struct perf_mmap *map, void *to, void *bf, size_t size)
482 {
483         struct record *rec = to;
484
485         if (record__comp_enabled(rec)) {
486                 size = zstd_compress(rec->session, map->data, perf_mmap__mmap_len(map), bf, size);
487                 bf   = map->data;
488         }
489
490         rec->samples++;
491         return record__write(rec, map, bf, size);
492 }
493
494 static volatile int done;
495 static volatile int signr = -1;
496 static volatile int child_finished;
497
498 static void sig_handler(int sig)
499 {
500         if (sig == SIGCHLD)
501                 child_finished = 1;
502         else
503                 signr = sig;
504
505         done = 1;
506 }
507
508 static void sigsegv_handler(int sig)
509 {
510         perf_hooks__recover();
511         sighandler_dump_stack(sig);
512 }
513
514 static void record__sig_exit(void)
515 {
516         if (signr == -1)
517                 return;
518
519         signal(signr, SIG_DFL);
520         raise(signr);
521 }
522
523 #ifdef HAVE_AUXTRACE_SUPPORT
524
525 static int record__process_auxtrace(struct perf_tool *tool,
526                                     struct perf_mmap *map,
527                                     union perf_event *event, void *data1,
528                                     size_t len1, void *data2, size_t len2)
529 {
530         struct record *rec = container_of(tool, struct record, tool);
531         struct perf_data *data = &rec->data;
532         size_t padding;
533         u8 pad[8] = {0};
534
535         if (!perf_data__is_pipe(data) && !perf_data__is_dir(data)) {
536                 off_t file_offset;
537                 int fd = perf_data__fd(data);
538                 int err;
539
540                 file_offset = lseek(fd, 0, SEEK_CUR);
541                 if (file_offset == -1)
542                         return -1;
543                 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
544                                                      event, file_offset);
545                 if (err)
546                         return err;
547         }
548
549         /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
550         padding = (len1 + len2) & 7;
551         if (padding)
552                 padding = 8 - padding;
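        /*
         * Worked example: len1 + len2 = 13 -> 13 & 7 = 5 -> padding = 3, so
         * the 13 payload bytes plus 3 bytes from pad[] keep the on-disk
         * auxtrace data 8-byte aligned (16 bytes in total).
         */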
553
554         record__write(rec, map, event, event->header.size);
555         record__write(rec, map, data1, len1);
556         if (len2)
557                 record__write(rec, map, data2, len2);
558         record__write(rec, map, &pad, padding);
559
560         return 0;
561 }
562
563 static int record__auxtrace_mmap_read(struct record *rec,
564                                       struct perf_mmap *map)
565 {
566         int ret;
567
568         ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
569                                   record__process_auxtrace);
570         if (ret < 0)
571                 return ret;
572
573         if (ret)
574                 rec->samples++;
575
576         return 0;
577 }
578
579 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
580                                                struct perf_mmap *map)
581 {
582         int ret;
583
584         ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
585                                            record__process_auxtrace,
586                                            rec->opts.auxtrace_snapshot_size);
587         if (ret < 0)
588                 return ret;
589
590         if (ret)
591                 rec->samples++;
592
593         return 0;
594 }
595
596 static int record__auxtrace_read_snapshot_all(struct record *rec)
597 {
598         int i;
599         int rc = 0;
600
601         for (i = 0; i < rec->evlist->nr_mmaps; i++) {
602                 struct perf_mmap *map = &rec->evlist->mmap[i];
603
604                 if (!map->auxtrace_mmap.base)
605                         continue;
606
607                 if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
608                         rc = -1;
609                         goto out;
610                 }
611         }
612 out:
613         return rc;
614 }
615
616 static void record__read_auxtrace_snapshot(struct record *rec)
617 {
618         pr_debug("Recording AUX area tracing snapshot\n");
619         if (record__auxtrace_read_snapshot_all(rec) < 0) {
620                 trigger_error(&auxtrace_snapshot_trigger);
621         } else {
622                 if (auxtrace_record__snapshot_finish(rec->itr))
623                         trigger_error(&auxtrace_snapshot_trigger);
624                 else
625                         trigger_ready(&auxtrace_snapshot_trigger);
626         }
627 }
628
629 static int record__auxtrace_init(struct record *rec)
630 {
631         int err;
632
633         if (!rec->itr) {
634                 rec->itr = auxtrace_record__init(rec->evlist, &err);
635                 if (err)
636                         return err;
637         }
638
639         err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
640                                               rec->opts.auxtrace_snapshot_opts);
641         if (err)
642                 return err;
643
644         return auxtrace_parse_filters(rec->evlist);
645 }
646
647 #else
648
649 static inline
650 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
651                                struct perf_mmap *map __maybe_unused)
652 {
653         return 0;
654 }
655
656 static inline
657 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused)
658 {
659 }
660
661 static inline
662 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
663 {
664         return 0;
665 }
666
667 static int record__auxtrace_init(struct record *rec __maybe_unused)
668 {
669         return 0;
670 }
671
672 #endif
673
674 static int record__mmap_evlist(struct record *rec,
675                                struct perf_evlist *evlist)
676 {
677         struct record_opts *opts = &rec->opts;
678         char msg[512];
679
680         if (opts->affinity != PERF_AFFINITY_SYS)
681                 cpu__setup_cpunode_map();
682
683         if (perf_evlist__mmap_ex(evlist, opts->mmap_pages,
684                                  opts->auxtrace_mmap_pages,
685                                  opts->auxtrace_snapshot_mode,
686                                  opts->nr_cblocks, opts->affinity,
687                                  opts->mmap_flush, opts->comp_level) < 0) {
688                 if (errno == EPERM) {
689                         pr_err("Permission error mapping pages.\n"
690                                "Consider increasing "
691                                "/proc/sys/kernel/perf_event_mlock_kb,\n"
692                                "or try again with a smaller value of -m/--mmap_pages.\n"
693                                "(current value: %u,%u)\n",
694                                opts->mmap_pages, opts->auxtrace_mmap_pages);
695                         return -errno;
696                 } else {
697                         pr_err("failed to mmap with %d (%s)\n", errno,
698                                 str_error_r(errno, msg, sizeof(msg)));
699                         if (errno)
700                                 return -errno;
701                         else
702                                 return -EINVAL;
703                 }
704         }
705         return 0;
706 }
707
708 static int record__mmap(struct record *rec)
709 {
710         return record__mmap_evlist(rec, rec->evlist);
711 }
712
713 static int record__open(struct record *rec)
714 {
715         char msg[BUFSIZ];
716         struct perf_evsel *pos;
717         struct perf_evlist *evlist = rec->evlist;
718         struct perf_session *session = rec->session;
719         struct record_opts *opts = &rec->opts;
720         int rc = 0;
721
722         /*
723          * For initial_delay we need to add a dummy event so that we can track
724          * PERF_RECORD_MMAP while we wait for the initial delay to enable the
725          * real events, the ones asked for by the user.
726          */
727         if (opts->initial_delay) {
728                 if (perf_evlist__add_dummy(evlist))
729                         return -ENOMEM;
730
731                 pos = perf_evlist__first(evlist);
732                 pos->tracking = 0;
733                 pos = perf_evlist__last(evlist);
734                 pos->tracking = 1;
735                 pos->attr.enable_on_exec = 1;
736         }
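
        /*
         * perf_evlist__add_dummy() appends its event, so "first" above is the
         * first real event (tracking turned off) and "last" is the dummy,
         * which becomes the tracking event and is enabled on exec.  Side-band
         * data (mmap, comm) thus keeps flowing during the delay, while the
         * real counters are expected to stay disabled until
         * perf_evlist__enable() runs from __cmd_record() once the delay
         * expires.
         */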
737
738         perf_evlist__config(evlist, opts, &callchain_param);
739
740         evlist__for_each_entry(evlist, pos) {
741 try_again:
742                 if (perf_evsel__open(pos, pos->cpus, pos->threads) < 0) {
743                         if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
744                                 if (verbose > 0)
745                                         ui__warning("%s\n", msg);
746                                 goto try_again;
747                         }
748                         if ((errno == EINVAL || errno == EBADF) &&
749                             pos->leader != pos &&
750                             pos->weak_group) {
751                                 pos = perf_evlist__reset_weak_group(evlist, pos);
752                                 goto try_again;
753                         }
754                         rc = -errno;
755                         perf_evsel__open_strerror(pos, &opts->target,
756                                                   errno, msg, sizeof(msg));
757                         ui__error("%s\n", msg);
758                         goto out;
759                 }
760
761                 pos->supported = true;
762         }
763
764         if (perf_evlist__apply_filters(evlist, &pos)) {
765                 pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
766                         pos->filter, perf_evsel__name(pos), errno,
767                         str_error_r(errno, msg, sizeof(msg)));
768                 rc = -1;
769                 goto out;
770         }
771
772         rc = record__mmap(rec);
773         if (rc)
774                 goto out;
775
776         session->evlist = evlist;
777         perf_session__set_id_hdr_size(session);
778 out:
779         return rc;
780 }
781
782 static int process_sample_event(struct perf_tool *tool,
783                                 union perf_event *event,
784                                 struct perf_sample *sample,
785                                 struct perf_evsel *evsel,
786                                 struct machine *machine)
787 {
788         struct record *rec = container_of(tool, struct record, tool);
789
790         if (rec->evlist->first_sample_time == 0)
791                 rec->evlist->first_sample_time = sample->time;
792
793         rec->evlist->last_sample_time = sample->time;
794
795         if (rec->buildid_all)
796                 return 0;
797
798         rec->samples++;
799         return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
800 }
801
802 static int process_buildids(struct record *rec)
803 {
804         struct perf_session *session = rec->session;
805
806         if (perf_data__size(&rec->data) == 0)
807                 return 0;
808
809         /*
810          * During this process, it'll load the kernel map and replace
811          * dso->long_name with a real pathname it found.  In this case
812          * we prefer a vmlinux path like
813          *   /lib/modules/3.16.4/build/vmlinux
814          *
815          * rather than the build-id path (in the debug directory).
816          *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
817          */
818         symbol_conf.ignore_vmlinux_buildid = true;
819
820         /*
821          * If --buildid-all is given, it marks all DSOs regardless of hits,
822          * so there is no need to process samples. But if timestamp_boundary is
823          * enabled, it still needs to walk all samples to get the timestamps of
824          * the first/last samples.
825          */
826         if (rec->buildid_all && !rec->timestamp_boundary)
827                 rec->tool.sample = NULL;
828
829         return perf_session__process_events(session);
830 }
831
832 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
833 {
834         int err;
835         struct perf_tool *tool = data;
836         /*
837          * As for the guest kernel, when processing the record & report
838          * subcommands we arrange the module mmaps prior to the guest kernel
839          * mmap and trigger a DSO preload, because guest module symbols are
840          * loaded from guest kallsyms by default instead of /lib/modules/XXX/XXX.
841          * This avoids missing symbols when the first address falls in a
842          * module rather than in the guest kernel.
843          */
844         err = perf_event__synthesize_modules(tool, process_synthesized_event,
845                                              machine);
846         if (err < 0)
847                 pr_err("Couldn't record guest kernel [%d]'s reference"
848                        " relocation symbol.\n", machine->pid);
849
850         /*
851          * We use _stext for the guest kernel because the guest kernel's
852          * /proc/kallsyms sometimes has no _text.
853          */
854         err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
855                                                  machine);
856         if (err < 0)
857                 pr_err("Couldn't record guest kernel [%d]'s reference"
858                        " relocation symbol.\n", machine->pid);
859 }
860
861 static struct perf_event_header finished_round_event = {
862         .size = sizeof(struct perf_event_header),
863         .type = PERF_RECORD_FINISHED_ROUND,
864 };
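
/*
 * PERF_RECORD_FINISHED_ROUND is an ordering hint for the report side: it is
 * written after each pass over the mmaps (see record__mmap_read_evlist()), so
 * that everything queued before this point can be sorted by timestamp and
 * flushed by the ordered-events code.
 */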
865
866 static void record__adjust_affinity(struct record *rec, struct perf_mmap *map)
867 {
868         if (rec->opts.affinity != PERF_AFFINITY_SYS &&
869             !CPU_EQUAL(&rec->affinity_mask, &map->affinity_mask)) {
870                 CPU_ZERO(&rec->affinity_mask);
871                 CPU_OR(&rec->affinity_mask, &rec->affinity_mask, &map->affinity_mask);
872                 sched_setaffinity(0, sizeof(rec->affinity_mask), &rec->affinity_mask);
873         }
874 }
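
/*
 * With the NODE/CPU affinity modes (presumably --affinity=node / --affinity=cpu,
 * matching the affinity_tags above), the reading thread migrates onto the mask
 * recorded for each mmap before pushing its data, so the copy out of the kernel
 * ring buffer stays NUMA-local.  PERF_AFFINITY_SYS, the default, leaves the
 * thread where it is.
 */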
875
876 static size_t process_comp_header(void *record, size_t increment)
877 {
878         struct compressed_event *event = record;
879         size_t size = sizeof(*event);
880
881         if (increment) {
882                 event->header.size += increment;
883                 return increment;
884         }
885
886         event->header.type = PERF_RECORD_COMPRESSED;
887         event->header.size = size;
888
889         return size;
890 }
891
892 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
893                             void *src, size_t src_size)
894 {
895         size_t compressed;
896         size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct compressed_event) - 1;
897
898         compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
899                                                      max_record_size, process_comp_header);
900
901         session->bytes_transferred += src_size;
902         session->bytes_compressed  += compressed;
903
904         return compressed;
905 }
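
/*
 * The compressed bytes are framed as PERF_RECORD_COMPRESSED records:
 * process_comp_header() first lays down a header for each record and is then
 * called again with the produced size to patch header.size, while
 * max_record_size keeps every record under PERF_SAMPLE_MAX_SIZE so the report
 * side can decompress record by record.  bytes_transferred vs bytes_compressed
 * feeds the compression ratio reported at the end of the session.
 */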
906
907 static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evlist,
908                                     bool overwrite, bool synch)
909 {
910         u64 bytes_written = rec->bytes_written;
911         int i;
912         int rc = 0;
913         struct perf_mmap *maps;
914         int trace_fd = rec->data.file.fd;
915         off_t off = 0;
916
917         if (!evlist)
918                 return 0;
919
920         maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
921         if (!maps)
922                 return 0;
923
924         if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
925                 return 0;
926
927         if (record__aio_enabled(rec))
928                 off = record__aio_get_pos(trace_fd);
929
930         for (i = 0; i < evlist->nr_mmaps; i++) {
931                 u64 flush = 0;
932                 struct perf_mmap *map = &maps[i];
933
934                 if (map->base) {
935                         record__adjust_affinity(rec, map);
936                         if (synch) {
937                                 flush = map->flush;
938                                 map->flush = 1;
939                         }
940                         if (!record__aio_enabled(rec)) {
941                                 if (perf_mmap__push(map, rec, record__pushfn) < 0) {
942                                         if (synch)
943                                                 map->flush = flush;
944                                         rc = -1;
945                                         goto out;
946                                 }
947                         } else {
948                                 if (record__aio_push(rec, map, &off) < 0) {
949                                         record__aio_set_pos(trace_fd, off);
950                                         if (synch)
951                                                 map->flush = flush;
952                                         rc = -1;
953                                         goto out;
954                                 }
955                         }
956                         if (synch)
957                                 map->flush = flush;
958                 }
959
960                 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
961                     record__auxtrace_mmap_read(rec, map) != 0) {
962                         rc = -1;
963                         goto out;
964                 }
965         }
966
967         if (record__aio_enabled(rec))
968                 record__aio_set_pos(trace_fd, off);
969
970         /*
971          * Mark the round finished in case we wrote
972          * at least one event.
973          */
974         if (bytes_written != rec->bytes_written)
975                 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
976
977         if (overwrite)
978                 perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
979 out:
980         return rc;
981 }
982
983 static int record__mmap_read_all(struct record *rec, bool synch)
984 {
985         int err;
986
987         err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
988         if (err)
989                 return err;
990
991         return record__mmap_read_evlist(rec, rec->evlist, true, synch);
992 }
993
994 static void record__init_features(struct record *rec)
995 {
996         struct perf_session *session = rec->session;
997         int feat;
998
999         for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1000                 perf_header__set_feat(&session->header, feat);
1001
1002         if (rec->no_buildid)
1003                 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1004
1005         if (!have_tracepoints(&rec->evlist->entries))
1006                 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1007
1008         if (!rec->opts.branch_stack)
1009                 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1010
1011         if (!rec->opts.full_auxtrace)
1012                 perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1013
1014         if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1015                 perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1016
1017         perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1018         if (!record__comp_enabled(rec))
1019                 perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1020
1021         perf_header__clear_feat(&session->header, HEADER_STAT);
1022 }
1023
1024 static void
1025 record__finish_output(struct record *rec)
1026 {
1027         struct perf_data *data = &rec->data;
1028         int fd = perf_data__fd(data);
1029
1030         if (data->is_pipe)
1031                 return;
1032
1033         rec->session->header.data_size += rec->bytes_written;
1034         data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1035
1036         if (!rec->no_buildid) {
1037                 process_buildids(rec);
1038
1039                 if (rec->buildid_all)
1040                         dsos__hit_all(rec->session);
1041         }
1042         perf_session__write_header(rec->session, rec->evlist, fd, true);
1043
1044         return;
1045 }
1046
1047 static int record__synthesize_workload(struct record *rec, bool tail)
1048 {
1049         int err;
1050         struct thread_map *thread_map;
1051
1052         if (rec->opts.tail_synthesize != tail)
1053                 return 0;
1054
1055         thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1056         if (thread_map == NULL)
1057                 return -1;
1058
1059         err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1060                                                  process_synthesized_event,
1061                                                  &rec->session->machines.host,
1062                                                  rec->opts.sample_address);
1063         thread_map__put(thread_map);
1064         return err;
1065 }
1066
1067 static int record__synthesize(struct record *rec, bool tail);
1068
1069 static int
1070 record__switch_output(struct record *rec, bool at_exit)
1071 {
1072         struct perf_data *data = &rec->data;
1073         int fd, err;
1074         char *new_filename;
1075
1076         /* Same size as "2015122520103046" */
1077         char timestamp[] = "InvalidTimestamp";
1078
1079         record__aio_mmap_read_sync(rec);
1080
1081         record__synthesize(rec, true);
1082         if (target__none(&rec->opts.target))
1083                 record__synthesize_workload(rec, true);
1084
1085         rec->samples = 0;
1086         record__finish_output(rec);
1087         err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1088         if (err) {
1089                 pr_err("Failed to get current timestamp\n");
1090                 return -EINVAL;
1091         }
1092
1093         fd = perf_data__switch(data, timestamp,
1094                                     rec->session->header.data_offset,
1095                                     at_exit, &new_filename);
1096         if (fd >= 0 && !at_exit) {
1097                 rec->bytes_written = 0;
1098                 rec->session->header.data_size = 0;
1099         }
1100
1101         if (!quiet)
1102                 fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1103                         data->path, timestamp);
1104
1105         if (rec->switch_output.num_files) {
1106                 int n = rec->switch_output.cur_file + 1;
1107
1108                 if (n >= rec->switch_output.num_files)
1109                         n = 0;
1110                 rec->switch_output.cur_file = n;
1111                 if (rec->switch_output.filenames[n]) {
1112                         remove(rec->switch_output.filenames[n]);
1113                         zfree(&rec->switch_output.filenames[n]);
1114                 }
1115                 rec->switch_output.filenames[n] = new_filename;
1116         } else {
1117                 free(new_filename);
1118         }
1119
1120         /* Output tracking events */
1121         if (!at_exit) {
1122                 record__synthesize(rec, false);
1123
1124                 /*
1125                  * In 'perf record --switch-output' without -a,
1126                  * record__synthesize() in record__switch_output() won't
1127                  * generate tracking events because there's no thread_map
1128                  * in the evlist. As a result, the newly created perf.data
1129                  * doesn't contain mmap and comm information.
1130                  * Create a fake thread_map and directly call
1131                  * perf_event__synthesize_thread_map() for those events.
1132                  */
1133                 if (target__none(&rec->opts.target))
1134                         record__synthesize_workload(rec, false);
1135         }
1136         return fd;
1137 }
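
/*
 * Illustrative outcome, assuming perf_data__switch() keeps its usual naming:
 * each rotation moves the current output to "<path>.<timestamp>", e.g.
 * perf.data.2015122520103046, and starts a fresh file.  When a maximum number
 * of files was requested, the filenames[] ring above deletes the oldest dump
 * once num_files is exceeded.
 */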
1138
1139 static volatile int workload_exec_errno;
1140
1141 /*
1142  * perf_evlist__prepare_workload will send a SIGUSR1
1143  * if the fork fails, since we asked by setting its
1144  * if the fork fails, since we asked for it by setting its
1145  */
1146 static void workload_exec_failed_signal(int signo __maybe_unused,
1147                                         siginfo_t *info,
1148                                         void *ucontext __maybe_unused)
1149 {
1150         workload_exec_errno = info->si_value.sival_int;
1151         done = 1;
1152         child_finished = 1;
1153 }
1154
1155 static void snapshot_sig_handler(int sig);
1156 static void alarm_sig_handler(int sig);
1157
1158 int __weak
1159 perf_event__synth_time_conv(const struct perf_event_mmap_page *pc __maybe_unused,
1160                             struct perf_tool *tool __maybe_unused,
1161                             perf_event__handler_t process __maybe_unused,
1162                             struct machine *machine __maybe_unused)
1163 {
1164         return 0;
1165 }
1166
1167 static const struct perf_event_mmap_page *
1168 perf_evlist__pick_pc(struct perf_evlist *evlist)
1169 {
1170         if (evlist) {
1171                 if (evlist->mmap && evlist->mmap[0].base)
1172                         return evlist->mmap[0].base;
1173                 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].base)
1174                         return evlist->overwrite_mmap[0].base;
1175         }
1176         return NULL;
1177 }
1178
1179 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1180 {
1181         const struct perf_event_mmap_page *pc;
1182
1183         pc = perf_evlist__pick_pc(rec->evlist);
1184         if (pc)
1185                 return pc;
1186         return NULL;
1187 }
1188
1189 static int record__synthesize(struct record *rec, bool tail)
1190 {
1191         struct perf_session *session = rec->session;
1192         struct machine *machine = &session->machines.host;
1193         struct perf_data *data = &rec->data;
1194         struct record_opts *opts = &rec->opts;
1195         struct perf_tool *tool = &rec->tool;
1196         int fd = perf_data__fd(data);
1197         int err = 0;
1198
1199         if (rec->opts.tail_synthesize != tail)
1200                 return 0;
1201
1202         if (data->is_pipe) {
1203                 /*
1204                  * We need to synthesize events first, because some
1205                  * features work on top of them (on the report side).
1206                  */
1207                 err = perf_event__synthesize_attrs(tool, rec->evlist,
1208                                                    process_synthesized_event);
1209                 if (err < 0) {
1210                         pr_err("Couldn't synthesize attrs.\n");
1211                         goto out;
1212                 }
1213
1214                 err = perf_event__synthesize_features(tool, session, rec->evlist,
1215                                                       process_synthesized_event);
1216                 if (err < 0) {
1217                         pr_err("Couldn't synthesize features.\n");
1218                         return err;
1219                 }
1220
1221                 if (have_tracepoints(&rec->evlist->entries)) {
1222                         /*
1223                          * FIXME err <= 0 here actually means that
1224                          * there were no tracepoints, so it's not really
1225                          * an error, just that we don't need to
1226                          * synthesize anything.  We really have to
1227                          * return this more properly and also
1228                          * propagate errors that currently call die()
1229                          */
1230                         err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist,
1231                                                                   process_synthesized_event);
1232                         if (err <= 0) {
1233                                 pr_err("Couldn't record tracing data.\n");
1234                                 goto out;
1235                         }
1236                         rec->bytes_written += err;
1237                 }
1238         }
1239
1240         err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1241                                           process_synthesized_event, machine);
1242         if (err)
1243                 goto out;
1244
1245         if (rec->opts.full_auxtrace) {
1246                 err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1247                                         session, process_synthesized_event);
1248                 if (err)
1249                         goto out;
1250         }
1251
1252         if (!perf_evlist__exclude_kernel(rec->evlist)) {
1253                 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1254                                                          machine);
1255                 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1256                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1257                                    "Check /proc/kallsyms permission or run as root.\n");
1258
1259                 err = perf_event__synthesize_modules(tool, process_synthesized_event,
1260                                                      machine);
1261                 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1262                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1263                                    "Check /proc/modules permission or run as root.\n");
1264         }
1265
1266         if (perf_guest) {
1267                 machines__process_guests(&session->machines,
1268                                          perf_event__synthesize_guest_os, tool);
1269         }
1270
1271         err = perf_event__synthesize_extra_attr(&rec->tool,
1272                                                 rec->evlist,
1273                                                 process_synthesized_event,
1274                                                 data->is_pipe);
1275         if (err)
1276                 goto out;
1277
1278         err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->threads,
1279                                                  process_synthesized_event,
1280                                                 NULL);
1281         if (err < 0) {
1282                 pr_err("Couldn't synthesize thread map.\n");
1283                 return err;
1284         }
1285
1286         err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->cpus,
1287                                              process_synthesized_event, NULL);
1288         if (err < 0) {
1289                 pr_err("Couldn't synthesize cpu map.\n");
1290                 return err;
1291         }
1292
1293         err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1294                                                 machine, opts);
1295         if (err < 0)
1296                 pr_warning("Couldn't synthesize bpf events.\n");
1297
1298         err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,
1299                                             process_synthesized_event, opts->sample_address,
1300                                             1);
1301 out:
1302         return err;
1303 }
1304
1305 static int __cmd_record(struct record *rec, int argc, const char **argv)
1306 {
1307         int err;
1308         int status = 0;
1309         unsigned long waking = 0;
1310         const bool forks = argc > 0;
1311         struct perf_tool *tool = &rec->tool;
1312         struct record_opts *opts = &rec->opts;
1313         struct perf_data *data = &rec->data;
1314         struct perf_session *session;
1315         bool disabled = false, draining = false;
1316         struct perf_evlist *sb_evlist = NULL;
1317         int fd;
1318         float ratio = 0;
1319
1320         atexit(record__sig_exit);
1321         signal(SIGCHLD, sig_handler);
1322         signal(SIGINT, sig_handler);
1323         signal(SIGTERM, sig_handler);
1324         signal(SIGSEGV, sigsegv_handler);
1325
1326         if (rec->opts.record_namespaces)
1327                 tool->namespace_events = true;
1328
1329         if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1330                 signal(SIGUSR2, snapshot_sig_handler);
1331                 if (rec->opts.auxtrace_snapshot_mode)
1332                         trigger_on(&auxtrace_snapshot_trigger);
1333                 if (rec->switch_output.enabled)
1334                         trigger_on(&switch_output_trigger);
1335         } else {
1336                 signal(SIGUSR2, SIG_IGN);
1337         }
1338
1339         session = perf_session__new(data, false, tool);
1340         if (session == NULL) {
1341                 pr_err("Perf session creation failed.\n");
1342                 return -1;
1343         }
1344
1345         fd = perf_data__fd(data);
1346         rec->session = session;
1347
1348         if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1349                 pr_err("Compression initialization failed.\n");
1350                 return -1;
1351         }
1352
1353         session->header.env.comp_type  = PERF_COMP_ZSTD;
1354         session->header.env.comp_level = rec->opts.comp_level;
1355
1356         record__init_features(rec);
1357
1358         if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1359                 session->header.env.clockid_res_ns = rec->opts.clockid_res_ns;
1360
1361         if (forks) {
1362                 err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
1363                                                     argv, data->is_pipe,
1364                                                     workload_exec_failed_signal);
1365                 if (err < 0) {
1366                         pr_err("Couldn't run the workload!\n");
1367                         status = err;
1368                         goto out_delete_session;
1369                 }
1370         }
1371
1372         /*
1373          * If we have just a single event and are sending data
1374          * through a pipe, we need to force the id allocation,
1375          * because we synthesize event name through the pipe
1376          * and need the id for that.
1377          */
1378         if (data->is_pipe && rec->evlist->nr_entries == 1)
1379                 rec->opts.sample_id = true;
1380
1381         if (record__open(rec) != 0) {
1382                 err = -1;
1383                 goto out_child;
1384         }
1385         session->header.env.comp_mmap_len = session->evlist->mmap_len;
1386
1387         err = bpf__apply_obj_config();
1388         if (err) {
1389                 char errbuf[BUFSIZ];
1390
1391                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1392                 pr_err("ERROR: Apply config to BPF failed: %s\n",
1393                          errbuf);
1394                 goto out_child;
1395         }
1396
1397         /*
1398          * Normally perf_session__new would do this, but it doesn't have the
1399          * evlist.
1400          */
1401         if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
1402                 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1403                 rec->tool.ordered_events = false;
1404         }
1405
1406         if (!rec->evlist->nr_groups)
1407                 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1408
1409         if (data->is_pipe) {
1410                 err = perf_header__write_pipe(fd);
1411                 if (err < 0)
1412                         goto out_child;
1413         } else {
1414                 err = perf_session__write_header(session, rec->evlist, fd, false);
1415                 if (err < 0)
1416                         goto out_child;
1417         }
1418
1419         if (!rec->no_buildid
1420             && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1421                 pr_err("Couldn't generate buildids. "
1422                        "Use --no-buildid to profile anyway.\n");
1423                 err = -1;
1424                 goto out_child;
1425         }
1426
1427         if (!opts->no_bpf_event)
1428                 bpf_event__add_sb_event(&sb_evlist, &session->header.env);
1429
1430         if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) {
1431                 pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1432                 opts->no_bpf_event = true;
1433         }
1434
1435         err = record__synthesize(rec, false);
1436         if (err < 0)
1437                 goto out_child;
1438
1439         if (rec->realtime_prio) {
1440                 struct sched_param param;
1441
1442                 param.sched_priority = rec->realtime_prio;
1443                 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1444                         pr_err("Could not set realtime priority.\n");
1445                         err = -1;
1446                         goto out_child;
1447                 }
1448         }
1449
1450         /*
1451          * When perf is starting the traced process, all the events
1452          * (apart from group members) have enable_on_exec=1 set,
1453          * so don't spoil it by prematurely enabling them.
1454          */
1455         if (!target__none(&opts->target) && !opts->initial_delay)
1456                 perf_evlist__enable(rec->evlist);
1457
1458         /*
1459          * Let the child rip
1460          */
1461         if (forks) {
1462                 struct machine *machine = &session->machines.host;
1463                 union perf_event *event;
1464                 pid_t tgid;
1465
1466                 event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1467                 if (event == NULL) {
1468                         err = -ENOMEM;
1469                         goto out_child;
1470                 }
1471
1472                 /*
1473                  * Some H/W events are generated before the COMM event,
1474                  * which is emitted during exec(), so perf script
1475                  * cannot see a correct process name for those events.
1476                  * Synthesize a COMM event to prevent that.
1477                  */
1478                 tgid = perf_event__synthesize_comm(tool, event,
1479                                                    rec->evlist->workload.pid,
1480                                                    process_synthesized_event,
1481                                                    machine);
1482                 free(event);
1483
1484                 if (tgid == -1)
1485                         goto out_child;
1486
1487                 event = malloc(sizeof(event->namespaces) +
1488                                (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1489                                machine->id_hdr_size);
1490                 if (event == NULL) {
1491                         err = -ENOMEM;
1492                         goto out_child;
1493                 }
1494
1495                 /*
1496                  * Synthesize NAMESPACES event for the command specified.
1497                  */
1498                 perf_event__synthesize_namespaces(tool, event,
1499                                                   rec->evlist->workload.pid,
1500                                                   tgid, process_synthesized_event,
1501                                                   machine);
1502                 free(event);
1503
1504                 perf_evlist__start_workload(rec->evlist);
1505         }
1506
1507         if (opts->initial_delay) {
1508                 usleep(opts->initial_delay * USEC_PER_MSEC);
1509                 perf_evlist__enable(rec->evlist);
1510         }
1511
1512         trigger_ready(&auxtrace_snapshot_trigger);
1513         trigger_ready(&switch_output_trigger);
1514         perf_hooks__invoke_record_start();
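        /*
         * Main record loop: flush the mmap buffers, service AUX area
         * snapshot and switch-output requests, and poll for new data
         * until we are done (or draining) and no new samples arrived
         * in the last pass.
         */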
1515         for (;;) {
1516                 unsigned long long hits = rec->samples;
1517
1518                 /*
1519                  * rec->evlist->bkw_mmap_state may be BKW_MMAP_EMPTY
1520                  * here: when done == true and hits != rec->samples
1521                  * in the previous round.
1522                  *
1523                  * perf_evlist__toggle_bkw_mmap() ensures we never
1524                  * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1525                  */
1526                 if (trigger_is_hit(&switch_output_trigger) || done || draining)
1527                         perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1528
1529                 if (record__mmap_read_all(rec, false) < 0) {
1530                         trigger_error(&auxtrace_snapshot_trigger);
1531                         trigger_error(&switch_output_trigger);
1532                         err = -1;
1533                         goto out_child;
1534                 }
1535
1536                 if (auxtrace_record__snapshot_started) {
1537                         auxtrace_record__snapshot_started = 0;
1538                         if (!trigger_is_error(&auxtrace_snapshot_trigger))
1539                                 record__read_auxtrace_snapshot(rec);
1540                         if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1541                                 pr_err("AUX area tracing snapshot failed\n");
1542                                 err = -1;
1543                                 goto out_child;
1544                         }
1545                 }
1546
1547                 if (trigger_is_hit(&switch_output_trigger)) {
1548                         /*
1549                          * If switch_output_trigger is hit, the data in
1550                          * the overwritable ring buffer should have been collected,
1551                          * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1552                          *
1553                          * If SIGUSR2 is raised after or during record__mmap_read_all(),
1554                          * record__mmap_read_all() didn't collect data from
1555                          * the overwritable ring buffer. Read again.
1556                          */
1557                         if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1558                                 continue;
1559                         trigger_ready(&switch_output_trigger);
1560
1561                         /*
1562                          * Re-enable events in the overwrite ring buffer after
1563                          * record__mmap_read_all(): we should have collected
1564                          * data from it.
1565                          */
1566                         perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1567
1568                         if (!quiet)
1569                                 fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1570                                         waking);
1571                         waking = 0;
1572                         fd = record__switch_output(rec, false);
1573                         if (fd < 0) {
1574                                 pr_err("Failed to switch to new file\n");
1575                                 trigger_error(&switch_output_trigger);
1576                                 err = fd;
1577                                 goto out_child;
1578                         }
1579
1580                         /* re-arm the alarm */
1581                         if (rec->switch_output.time)
1582                                 alarm(rec->switch_output.time);
1583                 }
1584
1585                 if (hits == rec->samples) {
1586                         if (done || draining)
1587                                 break;
1588                         err = perf_evlist__poll(rec->evlist, -1);
1589                         /*
1590                          * Propagate the error only if there is one. Ignore a positive
1591                          * number of returned events and interrupt errors (EINTR).
1592                          */
1593                         if (err > 0 || (err < 0 && errno == EINTR))
1594                                 err = 0;
1595                         waking++;
1596
1597                         if (perf_evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1598                                 draining = true;
1599                 }
1600
1601                 /*
1602                  * When perf is starting the traced process, the events die
1603                  * with the process at the end and we wait for that, so there
1604                  * is no need to disable them in this case.
1605                  */
1606                 if (done && !disabled && !target__none(&opts->target)) {
1607                         trigger_off(&auxtrace_snapshot_trigger);
1608                         perf_evlist__disable(rec->evlist);
1609                         disabled = true;
1610                 }
1611         }
1612         trigger_off(&auxtrace_snapshot_trigger);
1613         trigger_off(&switch_output_trigger);
1614
1615         if (forks && workload_exec_errno) {
1616                 char msg[STRERR_BUFSIZE];
1617                 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1618                 pr_err("Workload failed: %s\n", emsg);
1619                 err = -1;
1620                 goto out_child;
1621         }
1622
1623         if (!quiet)
1624                 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
1625
1626         if (target__none(&rec->opts.target))
1627                 record__synthesize_workload(rec, true);
1628
1629 out_child:
1630         record__mmap_read_all(rec, true);
1631         record__aio_mmap_read_sync(rec);
1632
1633         if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
1634                 ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
1635                 session->header.env.comp_ratio = ratio + 0.5;
1636         }
1637
1638         if (forks) {
1639                 int exit_status;
1640
1641                 if (!child_finished)
1642                         kill(rec->evlist->workload.pid, SIGTERM);
1643
1644                 wait(&exit_status);
1645
1646                 if (err < 0)
1647                         status = err;
1648                 else if (WIFEXITED(exit_status))
1649                         status = WEXITSTATUS(exit_status);
1650                 else if (WIFSIGNALED(exit_status))
1651                         signr = WTERMSIG(exit_status);
1652         } else
1653                 status = err;
1654
1655         record__synthesize(rec, true);
1656         /* this will be recalculated during process_buildids() */
1657         rec->samples = 0;
1658
1659         if (!err) {
1660                 if (!rec->timestamp_filename) {
1661                         record__finish_output(rec);
1662                 } else {
1663                         fd = record__switch_output(rec, true);
1664                         if (fd < 0) {
1665                                 status = fd;
1666                                 goto out_delete_session;
1667                         }
1668                 }
1669         }
1670
1671         perf_hooks__invoke_record_end();
1672
1673         if (!err && !quiet) {
1674                 char samples[128];
1675                 const char *postfix = rec->timestamp_filename ?
1676                                         ".<timestamp>" : "";
1677
1678                 if (rec->samples && !rec->opts.full_auxtrace)
1679                         scnprintf(samples, sizeof(samples),
1680                                   " (%" PRIu64 " samples)", rec->samples);
1681                 else
1682                         samples[0] = '\0';
1683
1684                 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
1685                         perf_data__size(data) / 1024.0 / 1024.0,
1686                         data->path, postfix, samples);
1687                 if (ratio) {
1688                         fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
1689                                         rec->session->bytes_transferred / 1024.0 / 1024.0,
1690                                         ratio);
1691                 }
1692                 fprintf(stderr, " ]\n");
1693         }
1694
1695 out_delete_session:
1696         zstd_fini(&session->zstd_data);
1697         perf_session__delete(session);
1698
1699         if (!opts->no_bpf_event)
1700                 perf_evlist__stop_sb_thread(sb_evlist);
1701         return status;
1702 }
1703
1704 static void callchain_debug(struct callchain_param *callchain)
1705 {
1706         static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
1707
1708         pr_debug("callchain: type %s\n", str[callchain->record_mode]);
1709
1710         if (callchain->record_mode == CALLCHAIN_DWARF)
1711                 pr_debug("callchain: stack dump size %d\n",
1712                          callchain->dump_size);
1713 }
1714
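/*
 * Parse the --call-graph argument: a record mode (fp, dwarf or lbr),
 * optionally followed by a stack dump size. DWARF unwinding needs the
 * sampled data address, so sample_address is enabled for that mode.
 */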
1715 int record_opts__parse_callchain(struct record_opts *record,
1716                                  struct callchain_param *callchain,
1717                                  const char *arg, bool unset)
1718 {
1719         int ret;
1720         callchain->enabled = !unset;
1721
1722         /* --no-call-graph */
1723         if (unset) {
1724                 callchain->record_mode = CALLCHAIN_NONE;
1725                 pr_debug("callchain: disabled\n");
1726                 return 0;
1727         }
1728
1729         ret = parse_callchain_record_opt(arg, callchain);
1730         if (!ret) {
1731                 /* Enable data address sampling for DWARF unwind. */
1732                 if (callchain->record_mode == CALLCHAIN_DWARF)
1733                         record->sample_address = true;
1734                 callchain_debug(callchain);
1735         }
1736
1737         return ret;
1738 }
1739
1740 int record_parse_callchain_opt(const struct option *opt,
1741                                const char *arg,
1742                                int unset)
1743 {
1744         return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
1745 }
1746
1747 int record_callchain_opt(const struct option *opt,
1748                          const char *arg __maybe_unused,
1749                          int unset __maybe_unused)
1750 {
1751         struct callchain_param *callchain = opt->value;
1752
1753         callchain->enabled = true;
1754
1755         if (callchain->record_mode == CALLCHAIN_NONE)
1756                 callchain->record_mode = CALLCHAIN_FP;
1757
1758         callchain_debug(callchain);
1759         return 0;
1760 }
1761
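/*
 * Handle 'record.*' keys from the perf config file: build-id collection
 * policy (cache, no-cache or skip), the default call-graph record mode,
 * and, when AIO support is compiled in, the number of AIO control blocks.
 */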
1762 static int perf_record_config(const char *var, const char *value, void *cb)
1763 {
1764         struct record *rec = cb;
1765
1766         if (!strcmp(var, "record.build-id")) {
1767                 if (!strcmp(value, "cache"))
1768                         rec->no_buildid_cache = false;
1769                 else if (!strcmp(value, "no-cache"))
1770                         rec->no_buildid_cache = true;
1771                 else if (!strcmp(value, "skip"))
1772                         rec->no_buildid = true;
1773                 else
1774                         return -1;
1775                 return 0;
1776         }
1777         if (!strcmp(var, "record.call-graph")) {
1778                 var = "call-graph.record-mode";
1779                 return perf_default_config(var, value, cb);
1780         }
1781 #ifdef HAVE_AIO_SUPPORT
1782         if (!strcmp(var, "record.aio")) {
1783                 rec->opts.nr_cblocks = strtol(value, NULL, 0);
1784                 if (!rec->opts.nr_cblocks)
1785                         rec->opts.nr_cblocks = nr_cblocks_default;
1786         }
1787 #endif
1788
1789         return 0;
1790 }
1791
1792 struct clockid_map {
1793         const char *name;
1794         int clockid;
1795 };
1796
1797 #define CLOCKID_MAP(n, c)       \
1798         { .name = n, .clockid = (c), }
1799
1800 #define CLOCKID_END     { .name = NULL, }
1801
1802
1803 /*
1804  * Add the missing ones; we need to build on many distros...
1805  */
1806 #ifndef CLOCK_MONOTONIC_RAW
1807 #define CLOCK_MONOTONIC_RAW 4
1808 #endif
1809 #ifndef CLOCK_BOOTTIME
1810 #define CLOCK_BOOTTIME 7
1811 #endif
1812 #ifndef CLOCK_TAI
1813 #define CLOCK_TAI 11
1814 #endif
1815
1816 static const struct clockid_map clockids[] = {
1817         /* available for all events, NMI safe */
1818         CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
1819         CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),
1820
1821         /* available for some events */
1822         CLOCKID_MAP("realtime", CLOCK_REALTIME),
1823         CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
1824         CLOCKID_MAP("tai", CLOCK_TAI),
1825
1826         /* available for the lazy */
1827         CLOCKID_MAP("mono", CLOCK_MONOTONIC),
1828         CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
1829         CLOCKID_MAP("real", CLOCK_REALTIME),
1830         CLOCKID_MAP("boot", CLOCK_BOOTTIME),
1831
1832         CLOCKID_END,
1833 };
1834
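/*
 * Query the resolution of the selected clock and store it in *res_ns,
 * leaving it at 0 if it cannot be determined.
 */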
1835 static int get_clockid_res(clockid_t clk_id, u64 *res_ns)
1836 {
1837         struct timespec res;
1838
1839         *res_ns = 0;
1840         if (!clock_getres(clk_id, &res))
1841                 *res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC;
1842         else
1843                 pr_warning("WARNING: Failed to determine specified clock resolution.\n");
1844
1845         return 0;
1846 }
1847
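/*
 * Parse the -k/--clockid argument: either a raw clockid number or one of
 * the names in clockids[], with an optional "CLOCK_" prefix. The clock's
 * resolution is looked up as well.
 */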
1848 static int parse_clockid(const struct option *opt, const char *str, int unset)
1849 {
1850         struct record_opts *opts = (struct record_opts *)opt->value;
1851         const struct clockid_map *cm;
1852         const char *ostr = str;
1853
1854         if (unset) {
1855                 opts->use_clockid = 0;
1856                 return 0;
1857         }
1858
1859         /* no arg passed */
1860         if (!str)
1861                 return 0;
1862
1863         /* no setting it twice */
1864         if (opts->use_clockid)
1865                 return -1;
1866
1867         opts->use_clockid = true;
1868
1869         /* if it's a number, we're done */
1870         if (sscanf(str, "%d", &opts->clockid) == 1)
1871                 return get_clockid_res(opts->clockid, &opts->clockid_res_ns);
1872
1873         /* allow a "CLOCK_" prefix to the name */
1874         if (!strncasecmp(str, "CLOCK_", 6))
1875                 str += 6;
1876
1877         for (cm = clockids; cm->name; cm++) {
1878                 if (!strcasecmp(str, cm->name)) {
1879                         opts->clockid = cm->clockid;
1880                         return get_clockid_res(opts->clockid,
1881                                                &opts->clockid_res_ns);
1882                 }
1883         }
1884
1885         opts->use_clockid = false;
1886         ui__warning("unknown clockid %s, check man page\n", ostr);
1887         return -1;
1888 }
1889
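/*
 * Parse the --affinity argument: "node" binds the trace reading thread to
 * the NUMA node of the mmap buffer being processed, "cpu" to that buffer's
 * CPU; anything else keeps the default system-wide affinity.
 */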
1890 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
1891 {
1892         struct record_opts *opts = (struct record_opts *)opt->value;
1893
1894         if (unset || !str)
1895                 return 0;
1896
1897         if (!strcasecmp(str, "node"))
1898                 opts->affinity = PERF_AFFINITY_NODE;
1899         else if (!strcasecmp(str, "cpu"))
1900                 opts->affinity = PERF_AFFINITY_CPU;
1901
1902         return 0;
1903 }
1904
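/*
 * Parse the -m/--mmap-pages argument "pages[,pages]": the first value
 * sizes the data mmaps, the optional second value (after the comma) sizes
 * the AUX area tracing mmaps.
 */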
1905 static int record__parse_mmap_pages(const struct option *opt,
1906                                     const char *str,
1907                                     int unset __maybe_unused)
1908 {
1909         struct record_opts *opts = opt->value;
1910         char *s, *p;
1911         unsigned int mmap_pages;
1912         int ret;
1913
1914         if (!str)
1915                 return -EINVAL;
1916
1917         s = strdup(str);
1918         if (!s)
1919                 return -ENOMEM;
1920
1921         p = strchr(s, ',');
1922         if (p)
1923                 *p = '\0';
1924
1925         if (*s) {
1926                 ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
1927                 if (ret)
1928                         goto out_free;
1929                 opts->mmap_pages = mmap_pages;
1930         }
1931
1932         if (!p) {
1933                 ret = 0;
1934                 goto out_free;
1935         }
1936
1937         ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
1938         if (ret)
1939                 goto out_free;
1940
1941         opts->auxtrace_mmap_pages = mmap_pages;
1942
1943 out_free:
1944         free(s);
1945         return ret;
1946 }
1947
1948 static void switch_output_size_warn(struct record *rec)
1949 {
1950         u64 wakeup_size = perf_evlist__mmap_size(rec->opts.mmap_pages);
1951         struct switch_output *s = &rec->switch_output;
1952
1953         wakeup_size /= 2;
1954
1955         if (s->size < wakeup_size) {
1956                 char buf[100];
1957
1958                 unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
1959                 pr_warning("WARNING: switch-output data size is lower than "
1960                            "the wakeup kernel buffer size (%s), "
1961                            "expect bigger perf.data sizes\n", buf);
1962         }
1963 }
1964
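/*
 * Parse the --switch-output argument: "signal" rotates the output file on
 * SIGUSR2, a size such as "100M" rotates after that much data has been
 * written, and a time such as "10m" rotates periodically. Any of these
 * implies timestamped output file names.
 */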
1965 static int switch_output_setup(struct record *rec)
1966 {
1967         struct switch_output *s = &rec->switch_output;
1968         static struct parse_tag tags_size[] = {
1969                 { .tag  = 'B', .mult = 1       },
1970                 { .tag  = 'K', .mult = 1 << 10 },
1971                 { .tag  = 'M', .mult = 1 << 20 },
1972                 { .tag  = 'G', .mult = 1 << 30 },
1973                 { .tag  = 0 },
1974         };
1975         static struct parse_tag tags_time[] = {
1976                 { .tag  = 's', .mult = 1        },
1977                 { .tag  = 'm', .mult = 60       },
1978                 { .tag  = 'h', .mult = 60*60    },
1979                 { .tag  = 'd', .mult = 60*60*24 },
1980                 { .tag  = 0 },
1981         };
1982         unsigned long val;
1983
1984         if (!s->set)
1985                 return 0;
1986
1987         if (!strcmp(s->str, "signal")) {
1988                 s->signal = true;
1989                 pr_debug("switch-output with SIGUSR2 signal\n");
1990                 goto enabled;
1991         }
1992
1993         val = parse_tag_value(s->str, tags_size);
1994         if (val != (unsigned long) -1) {
1995                 s->size = val;
1996                 pr_debug("switch-output with %s size threshold\n", s->str);
1997                 goto enabled;
1998         }
1999
2000         val = parse_tag_value(s->str, tags_time);
2001         if (val != (unsigned long) -1) {
2002                 s->time = val;
2003                 pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2004                          s->str, s->time);
2005                 goto enabled;
2006         }
2007
2008         return -1;
2009
2010 enabled:
2011         rec->timestamp_filename = true;
2012         s->enabled              = true;
2013
2014         if (s->size && !rec->opts.no_buffering)
2015                 switch_output_size_warn(rec);
2016
2017         return 0;
2018 }
2019
2020 static const char * const __record_usage[] = {
2021         "perf record [<options>] [<command>]",
2022         "perf record [<options>] -- <command> [<options>]",
2023         NULL
2024 };
2025 const char * const *record_usage = __record_usage;
2026
2027 /*
2028  * XXX Ideally this would be local to cmd_record() and passed to a record__new
2029  * because we need to have access to it in record__exit, which is called
2030  * after cmd_record() exits, but since record_options needs to be accessible to
2031  * builtin-script, leave it here.
2032  *
2033  * At least we don't touch it in all the other functions here directly.
2034  *
2035  * Just say no to tons of global variables, sigh.
2036  */
2037 static struct record record = {
2038         .opts = {
2039                 .sample_time         = true,
2040                 .mmap_pages          = UINT_MAX,
2041                 .user_freq           = UINT_MAX,
2042                 .user_interval       = ULLONG_MAX,
2043                 .freq                = 4000,
2044                 .target              = {
2045                         .uses_mmap   = true,
2046                         .default_per_cpu = true,
2047                 },
2048                 .mmap_flush          = MMAP_FLUSH_DEFAULT,
2049         },
2050         .tool = {
2051                 .sample         = process_sample_event,
2052                 .fork           = perf_event__process_fork,
2053                 .exit           = perf_event__process_exit,
2054                 .comm           = perf_event__process_comm,
2055                 .namespaces     = perf_event__process_namespaces,
2056                 .mmap           = perf_event__process_mmap,
2057                 .mmap2          = perf_event__process_mmap2,
2058                 .ordered_events = true,
2059         },
2060 };
2061
2062 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2063         "\n\t\t\t\tDefault: fp";
2064
2065 static bool dry_run;
2066
2067 /*
2068  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
2069  * with it and switch to use the library functions in perf_evlist that came
2070  * from builtin-record.c, i.e. use record_opts,
2071  * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
2072  * using pipes, etc.
2073  */
2074 static struct option __record_options[] = {
2075         OPT_CALLBACK('e', "event", &record.evlist, "event",
2076                      "event selector. use 'perf list' to list available events",
2077                      parse_events_option),
2078         OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2079                      "event filter", parse_filter),
2080         OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2081                            NULL, "don't record events from perf itself",
2082                            exclude_perf),
2083         OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2084                     "record events on existing process id"),
2085         OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2086                     "record events on existing thread id"),
2087         OPT_INTEGER('r', "realtime", &record.realtime_prio,
2088                     "collect data with this RT SCHED_FIFO priority"),
2089         OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2090                     "collect data without buffering"),
2091         OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2092                     "collect raw sample records from all opened counters"),
2093         OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2094                             "system-wide collection from all CPUs"),
2095         OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2096                     "list of cpus to monitor"),
2097         OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2098         OPT_STRING('o', "output", &record.data.path, "file",
2099                     "output file name"),
2100         OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2101                         &record.opts.no_inherit_set,
2102                         "child tasks do not inherit counters"),
2103         OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2104                     "synthesize non-sample events at the end of output"),
2105         OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2106         OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
2107         OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2108                     "Fail if the specified frequency can't be used"),
2109         OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2110                      "profile at this frequency",
2111                       record__parse_freq),
2112         OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2113                      "number of mmap data pages and AUX area tracing mmap pages",
2114                      record__parse_mmap_pages),
2115         OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2116                      "Minimum number of bytes that is extracted from mmap data pages (default: 1)",
2117                      record__mmap_flush_parse),
2118         OPT_BOOLEAN(0, "group", &record.opts.group,
2119                     "put the counters into a counter group"),
2120         OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2121                            NULL, "enables call-graph recording" ,
2122                            &record_callchain_opt),
2123         OPT_CALLBACK(0, "call-graph", &record.opts,
2124                      "record_mode[,record_size]", record_callchain_help,
2125                      &record_parse_callchain_opt),
2126         OPT_INCR('v', "verbose", &verbose,
2127                     "be more verbose (show counter open errors, etc)"),
2128         OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2129         OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2130                     "per thread counts"),
2131         OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2132         OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2133                     "Record the sample physical addresses"),
2134         OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2135         OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2136                         &record.opts.sample_time_set,
2137                         "Record the sample timestamps"),
2138         OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2139                         "Record the sample period"),
2140         OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2141                     "don't sample"),
2142         OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2143                         &record.no_buildid_cache_set,
2144                         "do not update the buildid cache"),
2145         OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2146                         &record.no_buildid_set,
2147                         "do not collect buildids in perf.data"),
2148         OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2149                      "monitor event in cgroup name only",
2150                      parse_cgroups),
2151         OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
2152                   "ms to wait before starting measurement after program start"),
2153         OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2154                    "user to profile"),
2155
2156         OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2157                      "branch any", "sample any taken branches",
2158                      parse_branch_stack),
2159
2160         OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2161                      "branch filter mask", "branch stack filter modes",
2162                      parse_branch_stack),
2163         OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2164                     "sample by weight (on special events only)"),
2165         OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2166                     "sample transaction flags (special events only)"),
2167         OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2168                     "use per-thread mmaps"),
2169         OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2170                     "sample selected machine registers on interrupt,"
2171                     " use '-I?' to list register names", parse_intr_regs),
2172         OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2173                     "sample selected machine registers on interrupt,"
2174                     " use '--user-regs=?' to list register names", parse_user_regs),
2175         OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2176                     "Record running/enabled time of read (:S) events"),
2177         OPT_CALLBACK('k', "clockid", &record.opts,
2178         "clockid", "clockid to use for events, see clock_gettime()",
2179         parse_clockid),
2180         OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2181                           "opts", "AUX area tracing Snapshot Mode", ""),
2182         OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2183                         "per thread proc mmap processing timeout in ms"),
2184         OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2185                     "Record namespaces events"),
2186         OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
2187                     "Record context switch events"),
2188         OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2189                          "Configure all used events to run in kernel space.",
2190                          PARSE_OPT_EXCLUSIVE),
2191         OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2192                          "Configure all used events to run in user space.",
2193                          PARSE_OPT_EXCLUSIVE),
2194         OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2195                     "collect kernel callchains"),
2196         OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2197                     "collect user callchains"),
2198         OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2199                    "clang binary to use for compiling BPF scriptlets"),
2200         OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2201                    "options passed to clang when compiling BPF scriptlets"),
2202         OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2203                    "file", "vmlinux pathname"),
2204         OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2205                     "Record build-id of all DSOs regardless of hits"),
2206         OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2207                     "append timestamp to output filename"),
2208         OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2209                     "Record timestamp boundary (time of first/last samples)"),
2210         OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2211                           &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2212                           "Switch output when receiving SIGUSR2 (signal) or when crossing a size or time threshold",
2213                           "signal"),
2214         OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2215                    "Limit the number of files generated by switch output"),
2216         OPT_BOOLEAN(0, "dry-run", &dry_run,
2217                     "Parse options then exit"),
2218 #ifdef HAVE_AIO_SUPPORT
2219         OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2220                      &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2221                      record__aio_parse),
2222 #endif
2223         OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2224                      "Set the affinity mask of the trace reading thread to the NUMA node cpu mask or the cpu of the processed mmap buffer",
2225                      record__parse_affinity),
2226 #ifdef HAVE_ZSTD_SUPPORT
2227         OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2228                             "n", "Compress records using the specified level (default: 1 - fastest compression, 22 - greatest compression)",
2229                             record__parse_comp_level),
2230 #endif
2231         OPT_END()
2232 };
2233
2234 struct option *record_options = __record_options;
2235
2236 int cmd_record(int argc, const char **argv)
2237 {
2238         int err;
2239         struct record *rec = &record;
2240         char errbuf[BUFSIZ];
2241
2242         setlocale(LC_ALL, "");
2243
2244 #ifndef HAVE_LIBBPF_SUPPORT
2245 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2246         set_nobuild('\0', "clang-path", true);
2247         set_nobuild('\0', "clang-opt", true);
2248 # undef set_nobuild
2249 #endif
2250
2251 #ifndef HAVE_BPF_PROLOGUE
2252 # if !defined (HAVE_DWARF_SUPPORT)
2253 #  define REASON  "NO_DWARF=1"
2254 # elif !defined (HAVE_LIBBPF_SUPPORT)
2255 #  define REASON  "NO_LIBBPF=1"
2256 # else
2257 #  define REASON  "this architecture doesn't support BPF prologue"
2258 # endif
2259 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2260         set_nobuild('\0', "vmlinux", true);
2261 # undef set_nobuild
2262 # undef REASON
2263 #endif
2264
2265         CPU_ZERO(&rec->affinity_mask);
2266         rec->opts.affinity = PERF_AFFINITY_SYS;
2267
2268         rec->evlist = perf_evlist__new();
2269         if (rec->evlist == NULL)
2270                 return -ENOMEM;
2271
2272         err = perf_config(perf_record_config, rec);
2273         if (err)
2274                 return err;
2275
2276         argc = parse_options(argc, argv, record_options, record_usage,
2277                             PARSE_OPT_STOP_AT_NON_OPTION);
2278         if (quiet)
2279                 perf_quiet_option();
2280
2281         /* Make system wide (-a) the default target. */
2282         if (!argc && target__none(&rec->opts.target))
2283                 rec->opts.target.system_wide = true;
2284
2285         if (nr_cgroups && !rec->opts.target.system_wide) {
2286                 usage_with_options_msg(record_usage, record_options,
2287                         "cgroup monitoring only available in system-wide mode");
2289         }
2290
2291         if (rec->opts.comp_level != 0) {
2292                 pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2293                 rec->no_buildid = true;
2294         }
2295
2296         if (rec->opts.record_switch_events &&
2297             !perf_can_record_switch_events()) {
2298                 ui__error("kernel does not support recording context switch events\n");
2299                 parse_options_usage(record_usage, record_options, "switch-events", 0);
2300                 return -EINVAL;
2301         }
2302
2303         if (switch_output_setup(rec)) {
2304                 parse_options_usage(record_usage, record_options, "switch-output", 0);
2305                 return -EINVAL;
2306         }
2307
2308         if (rec->switch_output.time) {
2309                 signal(SIGALRM, alarm_sig_handler);
2310                 alarm(rec->switch_output.time);
2311         }
2312
2313         if (rec->switch_output.num_files) {
2314                 rec->switch_output.filenames = calloc(rec->switch_output.num_files,
2315                                                       sizeof(char *));
2316                 if (!rec->switch_output.filenames)
2317                         return -EINVAL;
2318         }
2319
2320         /*
2321          * Allow aliases to facilitate the lookup of symbols for address
2322          * filters. Refer to auxtrace_parse_filters().
2323          */
2324         symbol_conf.allow_aliases = true;
2325
2326         symbol__init(NULL);
2327
2328         err = record__auxtrace_init(rec);
2329         if (err)
2330                 goto out;
2331
2332         if (dry_run)
2333                 goto out;
2334
2335         err = bpf__setup_stdout(rec->evlist);
2336         if (err) {
2337                 bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2338                 pr_err("ERROR: Setup BPF stdout failed: %s\n",
2339                          errbuf);
2340                 goto out;
2341         }
2342
2343         err = -ENOMEM;
2344
2345         if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(rec->evlist))
2346                 pr_warning(
2347 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
2348 "check /proc/sys/kernel/kptr_restrict.\n\n"
2349 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
2350 "file is not found in the buildid cache or in the vmlinux path.\n\n"
2351 "Samples in kernel modules won't be resolved at all.\n\n"
2352 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
2353 "even with a suitable vmlinux or kallsyms file.\n\n");
2354
2355         if (rec->no_buildid_cache || rec->no_buildid) {
2356                 disable_buildid_cache();
2357         } else if (rec->switch_output.enabled) {
2358                 /*
2359                  * In 'perf record --switch-output', disable buildid
2360                  * generation by default to reduce data file switching
2361                  * overhead. Still generate buildids if they are requested
2362                  * explicitly, using
2363                  *
2364                  *  perf record --switch-output --no-no-buildid \
2365                  *              --no-no-buildid-cache
2366                  *
2367                  * The following code is equivalent to:
2368                  *
2369                  * if ((rec->no_buildid || !rec->no_buildid_set) &&
2370                  *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2371                  *         disable_buildid_cache();
2372                  */
2373                 bool disable = true;
2374
2375                 if (rec->no_buildid_set && !rec->no_buildid)
2376                         disable = false;
2377                 if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2378                         disable = false;
2379                 if (disable) {
2380                         rec->no_buildid = true;
2381                         rec->no_buildid_cache = true;
2382                         disable_buildid_cache();
2383                 }
2384         }
2385
2386         if (record.opts.overwrite)
2387                 record.opts.tail_synthesize = true;
2388
2389         if (rec->evlist->nr_entries == 0 &&
2390             __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
2391                 pr_err("Not enough memory for event selector list\n");
2392                 goto out;
2393         }
2394
2395         if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2396                 rec->opts.no_inherit = true;
2397
2398         err = target__validate(&rec->opts.target);
2399         if (err) {
2400                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2401                 ui__warning("%s\n", errbuf);
2402         }
2403
2404         err = target__parse_uid(&rec->opts.target);
2405         if (err) {
2406                 int saved_errno = errno;
2407
2408                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2409                 ui__error("%s", errbuf);
2410
2411                 err = -saved_errno;
2412                 goto out;
2413         }
2414
2415         /* Enable ignoring missing threads when the -u/-p option is defined. */
2416         rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2417
2418         err = -ENOMEM;
2419         if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2420                 usage_with_options(record_usage, record_options);
2421
2422         err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2423         if (err)
2424                 goto out;
2425
2426         /*
2427          * We take all buildids when the file contains
2428          * AUX area tracing data because we do not decode the
2429          * trace, as that would take too long.
2430          */
2431         if (rec->opts.full_auxtrace)
2432                 rec->buildid_all = true;
2433
2434         if (record_opts__config(&rec->opts)) {
2435                 err = -EINVAL;
2436                 goto out;
2437         }
2438
2439         if (rec->opts.nr_cblocks > nr_cblocks_max)
2440                 rec->opts.nr_cblocks = nr_cblocks_max;
2441         pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2442
2443         pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2444         pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2445
2446         if (rec->opts.comp_level > comp_level_max)
2447                 rec->opts.comp_level = comp_level_max;
2448         pr_debug("comp level: %d\n", rec->opts.comp_level);
2449
2450         err = __cmd_record(&record, argc, argv);
2451 out:
2452         perf_evlist__delete(rec->evlist);
2453         symbol__exit();
2454         auxtrace_record__free(rec->itr);
2455         return err;
2456 }
2457
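/*
 * SIGUSR2 handler: kick off an AUX area tracing snapshot if one is armed
 * and, with --switch-output=signal, request a switch to a new output file.
 */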
2458 static void snapshot_sig_handler(int sig __maybe_unused)
2459 {
2460         struct record *rec = &record;
2461
2462         if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2463                 trigger_hit(&auxtrace_snapshot_trigger);
2464                 auxtrace_record__snapshot_started = 1;
2465                 if (auxtrace_record__snapshot_start(record.itr))
2466                         trigger_error(&auxtrace_snapshot_trigger);
2467         }
2468
2469         if (switch_output_signal(rec))
2470                 trigger_hit(&switch_output_trigger);
2471 }
2472
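/*
 * SIGALRM handler for --switch-output=<time>: request an output file
 * switch once the configured period has elapsed; the alarm is re-armed
 * in the record loop after the switch.
 */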
2473 static void alarm_sig_handler(int sig __maybe_unused)
2474 {
2475         struct record *rec = &record;
2476
2477         if (switch_output_time(rec))
2478                 trigger_hit(&switch_output_trigger);
2479 }