1 // SPDX-License-Identifier: GPL-2.0
5 * Builtin record command: Record the profile of a workload
6 * (or a CPU, or a PID) into the perf.data output file - for
7 * later analysis via perf report.
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include "util/parse-events.h"
14 #include "util/config.h"
16 #include "util/callchain.h"
17 #include "util/cgroup.h"
18 #include "util/header.h"
19 #include "util/event.h"
20 #include "util/evlist.h"
21 #include "util/evsel.h"
22 #include "util/debug.h"
23 #include "util/mmap.h"
24 #include "util/target.h"
25 #include "util/session.h"
26 #include "util/tool.h"
27 #include "util/symbol.h"
28 #include "util/record.h"
29 #include "util/cpumap.h"
30 #include "util/thread_map.h"
31 #include "util/data.h"
32 #include "util/perf_regs.h"
33 #include "util/auxtrace.h"
35 #include "util/parse-branch-options.h"
36 #include "util/parse-regs-options.h"
37 #include "util/perf_api_probe.h"
38 #include "util/llvm-utils.h"
39 #include "util/bpf-loader.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
49 #include "util/clockid.h"
50 #include "util/pmu-hybrid.h"
51 #include "util/evlist-hybrid.h"
63 #ifdef HAVE_EVENTFD_SUPPORT
64 #include <sys/eventfd.h>
68 #include <sys/types.h>
71 #include <linux/err.h>
72 #include <linux/string.h>
73 #include <linux/time64.h>
74 #include <linux/zalloc.h>
75 #include <linux/bitmap.h>
/*
 * NOTE(review): this listing is an elided excerpt -- the body of
 * "struct switch_output" and the "struct record {" opener are missing.
 * Code kept byte-identical; comments only added.
 */
/* Parameters controlling perf.data output rotation (--switch-output). */
78 struct switch_output {
/* Per-session state for 'perf record' (presumably struct record). */
91 struct perf_tool tool;
92 struct record_opts opts;
94 struct perf_data data;
/* AUX area tracing state, NULL when auxtrace is not in use. */
95 struct auxtrace_record *itr;
96 struct evlist *evlist;
97 struct perf_session *session;
/* Side-band event list, see record__setup_sb_evlist(). */
98 struct evlist *sb_evlist;
101 bool switch_output_event_set;
104 bool no_buildid_cache;
105 bool no_buildid_cache_set;
108 bool timestamp_filename;
109 bool timestamp_boundary;
110 struct switch_output switch_output;
111 unsigned long long samples;
/* CPU affinity mask applied while flushing mmap buffers. */
112 struct mmap_cpu_mask affinity_mask;
113 unsigned long output_max_size; /* = 0: unlimited */
114 struct perf_debuginfod debuginfod;
/* Main-loop exit flag; per the comment in sig_handler() it is set on signal. */
117 static volatile int done;
119 static volatile int auxtrace_record__snapshot_started;
/* Triggers coordinating snapshot capture and output switching. */
120 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
121 static DEFINE_TRIGGER(switch_output_trigger);
/* NOTE(review): initializer entries elided in this excerpt. */
123 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
127 static bool switch_output_signal(struct record *rec)
129 return rec->switch_output.signal &&
130 trigger_is_ready(&switch_output_trigger);
133 static bool switch_output_size(struct record *rec)
135 return rec->switch_output.size &&
136 trigger_is_ready(&switch_output_trigger) &&
137 (rec->bytes_written >= rec->switch_output.size);
140 static bool switch_output_time(struct record *rec)
142 return rec->switch_output.time &&
143 trigger_is_ready(&switch_output_trigger);
146 static bool record__output_max_size_exceeded(struct record *rec)
148 return rec->output_max_size &&
149 (rec->bytes_written >= rec->output_max_size);
/*
 * Write a chunk of trace data to the session's output file, accounting the
 * bytes written and firing the switch-output trigger / size-limit stop when
 * the relevant thresholds are crossed.
 * NOTE(review): elided excerpt -- braces and the return statements are
 * missing from this listing; code kept byte-identical.
 */
152 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
153 void *bf, size_t size)
155 struct perf_data_file *file = &rec->session->data->file;
157 if (perf_data_file__write(file, bf, size) < 0) {
158 pr_err("failed to write perf data, error: %m\n");
162 rec->bytes_written += size;
/* When the --max-size limit is hit, announce it and stop the session. */
164 if (record__output_max_size_exceeded(rec) && !done) {
165 fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
166 " stopping session ]\n",
167 rec->bytes_written >> 10);
/* Request output rotation once the configured size is reached. */
171 if (switch_output_size(rec))
172 trigger_hit(&switch_output_trigger);
/* Forward declarations for helpers defined later in this file. */
177 static int record__aio_enabled(struct record *rec);
178 static int record__comp_enabled(struct record *rec);
179 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
180 void *src, size_t src_size);
182 #ifdef HAVE_AIO_SUPPORT
/*
 * Queue an asynchronous write of buf[0..size) at offset off in the trace
 * file.  No completion notification is requested (SIGEV_NONE); completion is
 * polled via record__aio_complete().
 * NOTE(review): elided excerpt -- braces and return paths missing.
 */
183 static int record__aio_write(struct aiocb *cblock, int trace_fd,
184 void *buf, size_t size, off_t off)
188 cblock->aio_fildes = trace_fd;
189 cblock->aio_buf = buf;
190 cblock->aio_nbytes = size;
191 cblock->aio_offset = off;
192 cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
195 rc = aio_write(cblock);
/* EAGAIN means "queue full, retry later" and is not reported as an error. */
198 } else if (errno != EAGAIN) {
199 cblock->aio_fildes = -1;
200 pr_err("failed to queue perf data, error: %m\n");
/*
 * Check whether an aio write has finished; on partial writes, re-queue the
 * remainder.  On full completion drop the mmap reference taken when the
 * write was started.
 * NOTE(review): elided excerpt -- control-flow lines missing.
 */
208 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
214 ssize_t aio_ret, written;
216 aio_errno = aio_error(cblock);
217 if (aio_errno == EINPROGRESS)
220 written = aio_ret = aio_return(cblock);
222 if (aio_errno != EINTR)
223 pr_err("failed to write perf data, error: %m\n");
227 rem_size = cblock->aio_nbytes - written;
/* aio_fildes == -1 marks the control block as free for reuse. */
230 cblock->aio_fildes = -1;
232 * md->refcount is incremented in record__aio_pushfn() for
233 * every aio write request started in record__aio_push() so
234 * decrement it because the request is now complete.
236 perf_mmap__put(&md->core);
240 * aio write request may require restart with the
241 * reminder if the kernel didn't write whole
/* Re-queue the unwritten tail at the adjusted offset. */
244 rem_off = cblock->aio_offset + written;
245 rem_buf = (void *)(cblock->aio_buf + written);
246 record__aio_write(cblock, cblock->aio_fildes,
247 rem_buf, rem_size, rem_off);
/*
 * Wait until at least one (or, with sync_all, every) outstanding aio write
 * on this mmap completes; presumably returns the index of a free control
 * block -- TODO confirm against the full source.
 */
254 static int record__aio_sync(struct mmap *md, bool sync_all)
256 struct aiocb **aiocb = md->aio.aiocb;
257 struct aiocb *cblocks = md->aio.cblocks;
258 struct timespec timeout = { 0, 1000 * 1000 * 1 }; /* 1ms */
263 for (i = 0; i < md->aio.nr_cblocks; ++i) {
264 if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
271 * Started aio write is not complete yet
272 * so it has to be waited before the
275 aiocb[i] = &cblocks[i];
/* Sleep up to 1ms for any still-pending request; EAGAIN/EINTR just retry. */
282 while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
283 if (!(errno == EAGAIN || errno == EINTR))
284 pr_err("failed to sync perf data, error: %m\n");
/*
 * perf_mmap__push() callback: copy (optionally zstd-compressing) a chunk of
 * kernel ring-buffer data into the per-mmap aio staging buffer so the kernel
 * buffer can be released quickly.
 */
295 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
297 struct record_aio *aio = to;
300 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
301 * to release space in the kernel buffer as fast as possible, calling
302 * perf_mmap__consume() from perf_mmap__push() function.
304 * That lets the kernel to proceed with storing more profiling data into
305 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
307 * Coping can be done in two steps in case the chunk of profiling data
308 * crosses the upper bound of the kernel buffer. In this case we first move
309 * part of data from map->start till the upper bound and then the reminder
310 * from the beginning of the kernel buffer till the end of the data chunk.
313 if (record__comp_enabled(aio->rec)) {
314 size = zstd_compress(aio->rec->session, aio->data + aio->size,
315 mmap__mmap_len(map) - aio->size,
/* Uncompressed path: plain copy into the staging buffer. */
318 memcpy(aio->data + aio->size, buf, size);
323 * Increment map->refcount to guard map->aio.data[] buffer
324 * from premature deallocation because map object can be
325 * released earlier than aio write request started on
326 * map->aio.data[] buffer is complete.
328 * perf_mmap__put() is done at record__aio_complete()
329 * after started aio request completion or at record__aio_push()
330 * if the request failed to start.
332 perf_mmap__get(&map->core);
/*
 * Drain one mmap via aio: stage data with record__aio_pushfn(), then queue
 * an asynchronous write at *off, advancing the logical file position and
 * firing the switch-output trigger when the size threshold is crossed.
 */
340 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
343 int trace_fd = rec->session->data->file.fd;
344 struct record_aio aio = { .rec = rec, .size = 0 };
347 * Call record__aio_sync() to wait till map->aio.data[] buffer
348 * becomes available after previous aio write operation.
351 idx = record__aio_sync(map, false);
352 aio.data = map->aio.data[idx];
353 ret = perf_mmap__push(map, &aio, record__aio_pushfn);
354 if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
358 ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
361 rec->bytes_written += aio.size;
362 if (switch_output_size(rec))
363 trigger_hit(&switch_output_trigger);
366 * Decrement map->refcount incremented in record__aio_pushfn()
367 * back if record__aio_write() operation failed to start, otherwise
368 * map->refcount is decremented in record__aio_complete() after
369 * aio write operation finishes successfully.
371 perf_mmap__put(&map->core);
377 static off_t record__aio_get_pos(int trace_fd)
379 return lseek(trace_fd, 0, SEEK_CUR);
382 static void record__aio_set_pos(int trace_fd, off_t pos)
384 lseek(trace_fd, pos, SEEK_SET);
/* Wait for all outstanding aio writes across every mmap to complete. */
387 static void record__aio_mmap_read_sync(struct record *rec)
390 struct evlist *evlist = rec->evlist;
391 struct mmap *maps = evlist->mmap;
/* Nothing to flush when aio mode (--aio / nr_cblocks) is off. */
393 if (!record__aio_enabled(rec))
396 for (i = 0; i < evlist->core.nr_mmaps; i++) {
397 struct mmap *map = &maps[i];
400 record__aio_sync(map, true);
/* Default and maximum number of aio control blocks per mmap. */
404 static int nr_cblocks_default = 1;
405 static int nr_cblocks_max = 4;
/*
 * Option callback for --aio[=n]: 0 when unset, the parsed value otherwise,
 * falling back to the default when the string parses to 0.
 */
407 static int record__aio_parse(const struct option *opt,
411 struct record_opts *opts = (struct record_opts *)opt->value;
414 opts->nr_cblocks = 0;
417 opts->nr_cblocks = strtol(str, NULL, 0);
418 if (!opts->nr_cblocks)
419 opts->nr_cblocks = nr_cblocks_default;
424 #else /* HAVE_AIO_SUPPORT */
/* Stubs used when libc aio support is unavailable at build time. */
425 static int nr_cblocks_max = 0;
427 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
428 off_t *off __maybe_unused)
433 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
438 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
442 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
447 static int record__aio_enabled(struct record *rec)
449 return rec->opts.nr_cblocks > 0;
452 #define MMAP_FLUSH_DEFAULT 1
/*
 * Option callback for --mmap-flush: accepts a tagged size (B/K/M/G) or a
 * plain number, then clamps the result to [MMAP_FLUSH_DEFAULT, mmap size].
 */
453 static int record__mmap_flush_parse(const struct option *opt,
458 struct record_opts *opts = (struct record_opts *)opt->value;
459 static struct parse_tag tags[] = {
460 { .tag = 'B', .mult = 1 },
461 { .tag = 'K', .mult = 1 << 10 },
462 { .tag = 'M', .mult = 1 << 20 },
463 { .tag = 'G', .mult = 1 << 30 },
/* Try tagged form first; -1 means "no tag", fall back to plain strtol. */
471 opts->mmap_flush = parse_tag_value(str, tags);
472 if (opts->mmap_flush == (int)-1)
473 opts->mmap_flush = strtol(str, NULL, 0);
476 if (!opts->mmap_flush)
477 opts->mmap_flush = MMAP_FLUSH_DEFAULT;
/* The flush threshold can never exceed the mmap buffer size itself. */
479 flush_max = evlist__mmap_size(opts->mmap_pages);
481 if (opts->mmap_flush > flush_max)
482 opts->mmap_flush = flush_max;
487 #ifdef HAVE_ZSTD_SUPPORT
488 static unsigned int comp_level_default = 1;
/*
 * Option callback for -z/--compression-level: 0 when unset, otherwise the
 * parsed level, defaulting when the string parses to 0.
 */
490 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
492 struct record_opts *opts = opt->value;
495 opts->comp_level = 0;
498 opts->comp_level = strtol(str, NULL, 0);
499 if (!opts->comp_level)
500 opts->comp_level = comp_level_default;
506 static unsigned int comp_level_max = 22;
508 static int record__comp_enabled(struct record *rec)
510 return rec->opts.comp_level > 0;
513 static int process_synthesized_event(struct perf_tool *tool,
514 union perf_event *event,
515 struct perf_sample *sample __maybe_unused,
516 struct machine *machine __maybe_unused)
518 struct record *rec = container_of(tool, struct record, tool);
519 return record__write(rec, NULL, event, event->header.size);
522 static int process_locked_synthesized_event(struct perf_tool *tool,
523 union perf_event *event,
524 struct perf_sample *sample __maybe_unused,
525 struct machine *machine __maybe_unused)
527 static pthread_mutex_t synth_lock = PTHREAD_MUTEX_INITIALIZER;
530 pthread_mutex_lock(&synth_lock);
531 ret = process_synthesized_event(tool, event, sample, machine);
532 pthread_mutex_unlock(&synth_lock);
/*
 * perf_mmap__push() callback for the synchronous (non-aio) path: optionally
 * compress the chunk into map->data, then write it out.
 * NOTE(review): elided excerpt -- lines between compress and write missing.
 */
536 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
538 struct record *rec = to;
540 if (record__comp_enabled(rec)) {
541 size = zstd_compress(rec->session, map->data, mmap__mmap_len(map), bf, size);
546 return record__write(rec, map, bf, size);
/* Signal bookkeeping: which signal terminated us, and child exit status. */
549 static volatile int signr = -1;
550 static volatile int child_finished;
551 #ifdef HAVE_EVENTFD_SUPPORT
/* eventfd used to wake the main poll() loop from the signal handler. */
552 static int done_fd = -1;
/*
 * Signal handler: records the signal, sets 'done', and (with eventfd
 * support) wakes the poll loop.
 * NOTE(review): elided excerpt -- the handler's assignments are missing
 * from this listing.
 */
555 static void sig_handler(int sig)
563 #ifdef HAVE_EVENTFD_SUPPORT
567 * It is possible for this signal handler to run after done is checked
568 * in the main loop, but before the perf counter fds are polled. If this
569 * happens, the poll() will continue to wait even though done is set,
570 * and will only break out if either another signal is received, or the
571 * counters are ready for read. To ensure the poll() doesn't sleep when
572 * done is set, use an eventfd (done_fd) to wake up the poll().
574 if (write(done_fd, &tmp, sizeof(tmp)) < 0)
575 pr_err("failed to signal wakeup fd, error: %m\n");
577 #endif // HAVE_EVENTFD_SUPPORT
/* SIGSEGV handler: let perf hooks clean up, then dump a stack trace. */
static void sigsegv_handler(int sig)
{
	perf_hooks__recover();
	sighandler_dump_stack(sig);
}
586 static void record__sig_exit(void)
591 signal(signr, SIG_DFL);
595 #ifdef HAVE_AUXTRACE_SUPPORT
/*
 * Write an AUX area tracing event plus its (possibly split) data payload,
 * padding the total to an 8-byte boundary; for single-file non-pipe output
 * the auxtrace index is updated with the event's file offset first.
 */
597 static int record__process_auxtrace(struct perf_tool *tool,
599 union perf_event *event, void *data1,
600 size_t len1, void *data2, size_t len2)
602 struct record *rec = container_of(tool, struct record, tool);
603 struct perf_data *data = &rec->data;
607 if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
609 int fd = perf_data__fd(data);
612 file_offset = lseek(fd, 0, SEEK_CUR);
613 if (file_offset == -1)
615 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
621 /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
622 padding = (len1 + len2) & 7;
623 padding = 8 - padding;
626 record__write(rec, map, event, event->header.size);
627 record__write(rec, map, data1, len1);
629 record__write(rec, map, data2, len2);
630 record__write(rec, map, &pad, padding);
/* Drain one auxtrace mmap through record__process_auxtrace(). */
635 static int record__auxtrace_mmap_read(struct record *rec,
640 ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
641 record__process_auxtrace);
/* Snapshot-mode variant: read up to the configured snapshot size. */
651 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
656 ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
657 record__process_auxtrace,
658 rec->opts.auxtrace_snapshot_size);
/* Take a snapshot from every mmap that has an auxtrace area mapped. */
668 static int record__auxtrace_read_snapshot_all(struct record *rec)
673 for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
674 struct mmap *map = &rec->evlist->mmap[i];
/* Skip mmaps without an AUX area. */
676 if (!map->auxtrace_mmap.base)
679 if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
/*
 * Capture an AUX tracing snapshot, marking the snapshot trigger as errored
 * on failure or re-arming it on success.
 */
688 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
690 pr_debug("Recording AUX area tracing snapshot\n");
691 if (record__auxtrace_read_snapshot_all(rec) < 0) {
692 trigger_error(&auxtrace_snapshot_trigger);
694 if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
695 trigger_error(&auxtrace_snapshot_trigger);
697 trigger_ready(&auxtrace_snapshot_trigger);
/* Take a final snapshot at exit unless one is already in flight or errored. */
701 static int record__auxtrace_snapshot_exit(struct record *rec)
703 if (trigger_is_error(&auxtrace_snapshot_trigger))
706 if (!auxtrace_record__snapshot_started &&
707 auxtrace_record__snapshot_start(rec->itr))
710 record__read_auxtrace_snapshot(rec, true);
711 if (trigger_is_error(&auxtrace_snapshot_trigger))
/*
 * Initialize AUX area tracing: create the auxtrace record, parse snapshot
 * and sample options, regroup aux-output events and parse filters.
 */
717 static int record__auxtrace_init(struct record *rec)
722 rec->itr = auxtrace_record__init(rec->evlist, &err);
727 err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
728 rec->opts.auxtrace_snapshot_opts);
732 err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
733 rec->opts.auxtrace_sample_opts);
737 auxtrace_regroup_aux_output(rec->evlist);
739 return auxtrace_parse_filters(rec->evlist);
/* Stubs for builds without AUX area tracing support (the #else branch). */
745 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
746 struct mmap *map __maybe_unused)
752 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
753 bool on_exit __maybe_unused)
758 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
764 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
769 static int record__auxtrace_init(struct record *rec __maybe_unused)
/*
 * Ensure the evlist contains a system-wide dummy event configured to collect
 * PERF_RECORD_TEXT_POKE (and ksymbol) events on all CPUs.
 */
776 static int record__config_text_poke(struct evlist *evlist)
781 /* Nothing to do if text poke is already configured */
782 evlist__for_each_entry(evlist, evsel) {
783 if (evsel->core.attr.text_poke)
/* Add a user-space dummy event to carry the text-poke attribute. */
787 err = parse_events(evlist, "dummy:u", NULL);
791 evsel = evlist__last(evlist);
793 evsel->core.attr.freq = 0;
794 evsel->core.attr.sample_period = 1;
795 evsel->core.attr.text_poke = 1;
796 evsel->core.attr.ksymbol = 1;
798 evsel->core.system_wide = true;
799 evsel->no_aux_samples = true;
800 evsel->immediate = true;
802 /* Text poke must be collected on all CPUs */
803 perf_cpu_map__put(evsel->core.own_cpus);
804 evsel->core.own_cpus = perf_cpu_map__new(NULL);
805 perf_cpu_map__put(evsel->core.cpus);
806 evsel->core.cpus = perf_cpu_map__get(evsel->core.own_cpus);
808 evsel__set_sample_bit(evsel, TIME);
813 static bool record__kcore_readable(struct machine *machine)
815 char kcore[PATH_MAX];
818 scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
820 fd = open(kcore, O_RDONLY);
829 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
831 char from_dir[PATH_MAX];
832 char kcore_dir[PATH_MAX];
835 snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
837 ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
841 return kcore_copy(from_dir, kcore_dir);
/*
 * mmap the evlist's ring buffers with the configured sizes / aio / affinity /
 * compression settings, printing targeted diagnostics on failure.
 */
844 static int record__mmap_evlist(struct record *rec,
845 struct evlist *evlist)
847 struct record_opts *opts = &rec->opts;
/* Snapshot and sample modes both need overwritable AUX buffers. */
848 bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
849 opts->auxtrace_sample_mode;
852 if (opts->affinity != PERF_AFFINITY_SYS)
853 cpu__setup_cpunode_map();
855 if (evlist__mmap_ex(evlist, opts->mmap_pages,
856 opts->auxtrace_mmap_pages,
858 opts->nr_cblocks, opts->affinity,
859 opts->mmap_flush, opts->comp_level) < 0) {
/* EPERM almost always means the mlock limit was hit. */
860 if (errno == EPERM) {
861 pr_err("Permission error mapping pages.\n"
862 "Consider increasing "
863 "/proc/sys/kernel/perf_event_mlock_kb,\n"
864 "or try again with a smaller value of -m/--mmap_pages.\n"
865 "(current value: %u,%u)\n",
866 opts->mmap_pages, opts->auxtrace_mmap_pages);
869 pr_err("failed to mmap with %d (%s)\n", errno,
870 str_error_r(errno, msg, sizeof(msg)));
880 static int record__mmap(struct record *rec)
882 return record__mmap_evlist(rec, rec->evlist);
/*
 * Open all events in the evlist: add a tracking dummy event where needed,
 * configure and open each evsel (with fallback / weak-group retry), warn
 * about kptr_restrict, apply filters, and mmap the buffers.
 * NOTE(review): elided excerpt -- several control-flow lines are missing;
 * code kept byte-identical.
 */
885 static int record__open(struct record *rec)
889 struct evlist *evlist = rec->evlist;
890 struct perf_session *session = rec->session;
891 struct record_opts *opts = &rec->opts;
895 * For initial_delay, system wide or a hybrid system, we need to add a
896 * dummy event so that we can track PERF_RECORD_MMAP to cover the delay
897 * of waiting or event synthesis.
899 if (opts->initial_delay || target__has_cpu(&opts->target) ||
900 perf_pmu__has_hybrid()) {
901 pos = evlist__get_tracking_event(evlist);
902 if (!evsel__is_dummy_event(pos)) {
903 /* Set up dummy event. */
904 if (evlist__add_dummy(evlist))
906 pos = evlist__last(evlist);
907 evlist__set_tracking_event(evlist, pos);
911 * Enable the dummy event when the process is forked for
912 * initial_delay, immediately for system wide.
914 if (opts->initial_delay && !pos->immediate &&
915 !target__has_cpu(&opts->target))
916 pos->core.attr.enable_on_exec = 1;
921 evlist__config(evlist, opts, &callchain_param);
923 evlist__for_each_entry(evlist, pos) {
925 if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
/* First try a softer event config (evsel__fallback), then weak groups. */
926 if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
928 ui__warning("%s\n", msg);
931 if ((errno == EINVAL || errno == EBADF) &&
932 pos->core.leader != &pos->core &&
934 pos = evlist__reset_weak_group(evlist, pos, true);
938 evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
939 ui__error("%s\n", msg);
943 pos->supported = true;
946 if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
948 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
949 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
950 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
951 "file is not found in the buildid cache or in the vmlinux path.\n\n"
952 "Samples in kernel modules won't be resolved at all.\n\n"
953 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
954 "even with a suitable vmlinux or kallsyms file.\n\n");
957 if (evlist__apply_filters(evlist, &pos)) {
958 pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
959 pos->filter, evsel__name(pos), errno,
960 str_error_r(errno, msg, sizeof(msg)));
965 rc = record__mmap(rec);
969 session->evlist = evlist;
970 perf_session__set_id_hdr_size(session);
975 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
977 if (rec->evlist->first_sample_time == 0)
978 rec->evlist->first_sample_time = sample_time;
981 rec->evlist->last_sample_time = sample_time;
/*
 * Sample callback used while post-processing for build-ids: update the
 * timestamp boundary and mark the DSO hit (skipped when --buildid-all).
 * NOTE(review): elided excerpt -- part of the signature and body missing.
 */
984 static int process_sample_event(struct perf_tool *tool,
985 union perf_event *event,
986 struct perf_sample *sample,
988 struct machine *machine)
990 struct record *rec = container_of(tool, struct record, tool);
992 set_timestamp_boundary(rec, sample->time);
994 if (rec->buildid_all)
998 return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
/*
 * Walk the just-recorded data to collect build-ids (and, with
 * timestamp_boundary, first/last sample times).
 */
1001 static int process_buildids(struct record *rec)
1003 struct perf_session *session = rec->session;
/* Nothing to process when no data was written. */
1005 if (perf_data__size(&rec->data) == 0)
1009 * During this process, it'll load kernel map and replace the
1010 * dso->long_name to a real pathname it found. In this case
1011 * we prefer the vmlinux path like
1012 * /lib/modules/3.16.4/build/vmlinux
1014 * rather than build-id path (in debug directory).
1015 * $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1017 symbol_conf.ignore_vmlinux_buildid = true;
1020 * If --buildid-all is given, it marks all DSO regardless of hits,
1021 * so no need to process samples. But if timestamp_boundary is enabled,
1022 * it still needs to walk on all samples to get the timestamps of
1023 * first/last samples.
1025 if (rec->buildid_all && !rec->timestamp_boundary)
1026 rec->tool.sample = NULL;
1028 return perf_session__process_events(session);
/*
 * machines__process_guests() callback: synthesize module and kernel mmap
 * events for one guest machine.
 */
1031 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1034 struct perf_tool *tool = data;
1036 *As for guest kernel when processing subcommand record&report,
1037 *we arrange module mmap prior to guest kernel mmap and trigger
1038 *a preload dso because default guest module symbols are loaded
1039 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
1040 *method is used to avoid symbol missing when the first addr is
1041 *in module instead of in guest kernel.
1043 err = perf_event__synthesize_modules(tool, process_synthesized_event,
1046 pr_err("Couldn't record guest kernel [%d]'s reference"
1047 " relocation symbol.\n", machine->pid);
1050 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
1051 * have no _text sometimes.
1053 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1056 pr_err("Couldn't record guest kernel [%d]'s reference"
1057 " relocation symbol.\n", machine->pid);
1060 static struct perf_event_header finished_round_event = {
1061 .size = sizeof(struct perf_event_header),
1062 .type = PERF_RECORD_FINISHED_ROUND,
/*
 * With --affinity=node/cpu: migrate the recording thread onto the CPUs
 * backing this mmap before reading it, updating the cached mask first so
 * sched_setaffinity() is only called when the mask actually changes.
 */
1065 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1067 if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1068 !bitmap_equal(rec->affinity_mask.bits, map->affinity_mask.bits,
1069 rec->affinity_mask.nbits)) {
1070 bitmap_zero(rec->affinity_mask.bits, rec->affinity_mask.nbits);
1071 bitmap_or(rec->affinity_mask.bits, rec->affinity_mask.bits,
1072 map->affinity_mask.bits, rec->affinity_mask.nbits);
1073 sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&rec->affinity_mask),
1074 (cpu_set_t *)rec->affinity_mask.bits);
1076 mmap_cpu_mask__scnprintf(&rec->affinity_mask, "thread");
/*
 * Callback for the zstd streaming compressor: initialize or grow the
 * PERF_RECORD_COMPRESSED header that wraps each compressed chunk.
 * NOTE(review): elided excerpt -- the branch distinguishing the two cases
 * is missing from this listing.
 */
1080 static size_t process_comp_header(void *record, size_t increment)
1082 struct perf_record_compressed *event = record;
1083 size_t size = sizeof(*event);
1086 event->header.size += increment;
1090 event->header.type = PERF_RECORD_COMPRESSED;
1091 event->header.size = size;
/*
 * Compress src into dst as a stream of PERF_RECORD_COMPRESSED records and
 * account transferred vs. compressed byte counts on the session.
 */
1096 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
1097 void *src, size_t src_size)
/* Each record must fit in PERF_SAMPLE_MAX_SIZE including its header. */
1100 size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1102 compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
1103 max_record_size, process_comp_header);
1105 session->bytes_transferred += src_size;
1106 session->bytes_compressed += compressed;
/*
 * Flush every mmap of the evlist (normal or overwrite set) to the output,
 * via aio or synchronous push, draining auxtrace areas as well, and emit a
 * FINISHED_ROUND event if anything was written.
 * NOTE(review): elided excerpt -- several error-path lines are missing.
 */
1111 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1112 bool overwrite, bool synch)
1114 u64 bytes_written = rec->bytes_written;
1118 int trace_fd = rec->data.file.fd;
1124 maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
/* Overwrite buffers are only readable once data collection is paused. */
1128 if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1131 if (record__aio_enabled(rec))
1132 off = record__aio_get_pos(trace_fd);
1134 for (i = 0; i < evlist->core.nr_mmaps; i++) {
1136 struct mmap *map = &maps[i];
1138 if (map->core.base) {
1139 record__adjust_affinity(rec, map);
/* In synch mode flush everything: temporarily force flush threshold to 1. */
1141 flush = map->core.flush;
1142 map->core.flush = 1;
1144 if (!record__aio_enabled(rec)) {
1145 if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1147 map->core.flush = flush;
1152 if (record__aio_push(rec, map, &off) < 0) {
1153 record__aio_set_pos(trace_fd, off);
1155 map->core.flush = flush;
1161 map->core.flush = flush;
1164 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1165 !rec->opts.auxtrace_sample_mode &&
1166 record__auxtrace_mmap_read(rec, map) != 0) {
1172 if (record__aio_enabled(rec))
1173 record__aio_set_pos(trace_fd, off);
1176 * Mark the round finished in case we wrote
1177 * at least one event.
1179 if (bytes_written != rec->bytes_written)
1180 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1183 evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1188 static int record__mmap_read_all(struct record *rec, bool synch)
1192 err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1196 return record__mmap_read_evlist(rec, rec->evlist, true, synch);
/*
 * Start from "all header features set" and clear those that do not apply
 * to this session's options.
 */
1199 static void record__init_features(struct record *rec)
1201 struct perf_session *session = rec->session;
1204 for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1205 perf_header__set_feat(&session->header, feat);
1207 if (rec->no_buildid)
1208 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1210 if (!have_tracepoints(&rec->evlist->core.entries))
1211 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1213 if (!rec->opts.branch_stack)
1214 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1216 if (!rec->opts.full_auxtrace)
1217 perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1219 if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1220 perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1222 if (!rec->opts.use_clockid)
1223 perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1225 perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1226 if (!record__comp_enabled(rec))
1227 perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
/* Stat data is never produced by 'perf record'. */
1229 perf_header__clear_feat(&session->header, HEADER_STAT);
/*
 * Finalize the output file: record the data size, run build-id processing
 * (unless disabled) and rewrite the header.
 * NOTE(review): elided excerpt -- return type line and pipe-mode early
 * return appear to be missing.
 */
1233 record__finish_output(struct record *rec)
1235 struct perf_data *data = &rec->data;
1236 int fd = perf_data__fd(data);
1241 rec->session->header.data_size += rec->bytes_written;
1242 data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1244 if (!rec->no_buildid) {
1245 process_buildids(rec);
1247 if (rec->buildid_all)
1248 dsos__hit_all(rec->session);
1250 perf_session__write_header(rec->session, rec->evlist, fd, true);
/*
 * Synthesize thread-map events for the forked workload process (used when
 * recording without a thread/cpu target).
 */
1255 static int record__synthesize_workload(struct record *rec, bool tail)
1258 struct perf_thread_map *thread_map;
1259 bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
/* Only run in the phase (head/tail) matching --tail-synthesize. */
1261 if (rec->opts.tail_synthesize != tail)
1264 thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1265 if (thread_map == NULL)
1268 err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1269 process_synthesized_event,
1270 &rec->session->machines.host,
1272 rec->opts.sample_address);
1273 perf_thread_map__put(thread_map);
1277 static int record__synthesize(struct record *rec, bool tail);
/*
 * Rotate the perf.data output file (--switch-output): flush and finalize the
 * current file, switch to a timestamped new one, prune old files when
 * --switch-output-files is limited, and re-synthesize tracking events.
 * NOTE(review): elided excerpt -- return-type line and several error paths
 * are missing.
 */
1280 record__switch_output(struct record *rec, bool at_exit)
1282 struct perf_data *data = &rec->data;
1286 /* Same Size: "2015122520103046"*/
1287 char timestamp[] = "InvalidTimestamp";
1289 record__aio_mmap_read_sync(rec);
1291 record__synthesize(rec, true);
1292 if (target__none(&rec->opts.target))
1293 record__synthesize_workload(rec, true);
1296 record__finish_output(rec);
1297 err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1299 pr_err("Failed to get current timestamp\n");
1303 fd = perf_data__switch(data, timestamp,
1304 rec->session->header.data_offset,
1305 at_exit, &new_filename);
/* Reset accounting for the freshly opened file. */
1306 if (fd >= 0 && !at_exit) {
1307 rec->bytes_written = 0;
1308 rec->session->header.data_size = 0;
1312 fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1313 data->path, timestamp);
/* Ring of at most num_files outputs: delete the slot being reused. */
1315 if (rec->switch_output.num_files) {
1316 int n = rec->switch_output.cur_file + 1;
1318 if (n >= rec->switch_output.num_files)
1320 rec->switch_output.cur_file = n;
1321 if (rec->switch_output.filenames[n]) {
1322 remove(rec->switch_output.filenames[n]);
1323 zfree(&rec->switch_output.filenames[n]);
1325 rec->switch_output.filenames[n] = new_filename;
1330 /* Output tracking events */
1332 record__synthesize(rec, false);
1335 * In 'perf record --switch-output' without -a,
1336 * record__synthesize() in record__switch_output() won't
1337 * generate tracking events because there's no thread_map
1338 * in evlist. Which causes newly created perf.data doesn't
1339 * contain map and comm information.
1340 * Create a fake thread_map and directly call
1341 * perf_event__synthesize_thread_map() for those events.
1343 if (target__none(&rec->opts.target))
1344 record__synthesize_workload(rec, false);
1349 static volatile int workload_exec_errno;
1352 * evlist__prepare_workload will send a SIGUSR1
1353 * if the fork fails, since we asked by setting its
1354 * want_signal to true.
1356 static void workload_exec_failed_signal(int signo __maybe_unused,
1358 void *ucontext __maybe_unused)
1360 workload_exec_errno = info->si_value.sival_int;
/* Forward declarations for signal handlers defined later in this file. */
1365 static void snapshot_sig_handler(int sig);
1366 static void alarm_sig_handler(int sig);
1368 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1371 if (evlist->mmap && evlist->mmap[0].core.base)
1372 return evlist->mmap[0].core.base;
1373 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1374 return evlist->overwrite_mmap[0].core.base;
1379 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1381 const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
/*
 * Synthesize all non-sample metadata events for the session: pipe header
 * events, time-conv, id-index, auxtrace info, kernel/module mmaps, guest
 * events, extra attrs, thread/cpu maps, bpf and cgroup events, and finally
 * per-task events (optionally multi-threaded).
 * NOTE(review): elided excerpt -- error-path lines are missing.
 */
1387 static int record__synthesize(struct record *rec, bool tail)
1389 struct perf_session *session = rec->session;
1390 struct machine *machine = &session->machines.host;
1391 struct perf_data *data = &rec->data;
1392 struct record_opts *opts = &rec->opts;
1393 struct perf_tool *tool = &rec->tool;
1395 event_op f = process_synthesized_event;
/* Only run in the phase (head/tail) matching --tail-synthesize. */
1397 if (rec->opts.tail_synthesize != tail)
1400 if (data->is_pipe) {
1401 err = perf_event__synthesize_for_pipe(tool, session, data,
1402 process_synthesized_event);
1406 rec->bytes_written += err;
1409 err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1410 process_synthesized_event, machine);
1414 /* Synthesize id_index before auxtrace_info */
1415 if (rec->opts.auxtrace_sample_mode || rec->opts.full_auxtrace) {
1416 err = perf_event__synthesize_id_index(tool,
1417 process_synthesized_event,
1418 session->evlist, machine);
1423 if (rec->opts.full_auxtrace) {
1424 err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1425 session, process_synthesized_event);
1430 if (!evlist__exclude_kernel(rec->evlist)) {
1431 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1433 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1434 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1435 "Check /proc/kallsyms permission or run as root.\n");
1437 err = perf_event__synthesize_modules(tool, process_synthesized_event,
1439 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1440 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1441 "Check /proc/modules permission or run as root.\n");
1445 machines__process_guests(&session->machines,
1446 perf_event__synthesize_guest_os, tool);
1449 err = perf_event__synthesize_extra_attr(&rec->tool,
1451 process_synthesized_event,
1456 err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1457 process_synthesized_event,
1460 pr_err("Couldn't synthesize thread map.\n");
1464 err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
1465 process_synthesized_event, NULL);
1467 pr_err("Couldn't synthesize cpu map.\n");
1471 err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1474 pr_warning("Couldn't synthesize bpf events.\n");
1476 if (rec->opts.synth & PERF_SYNTH_CGROUP) {
1477 err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
1480 pr_warning("Couldn't synthesize cgroup events.\n");
/* Serialize writes when task synthesis runs on multiple threads. */
1483 if (rec->opts.nr_threads_synthesize > 1) {
1484 perf_set_multithreaded();
1485 f = process_locked_synthesized_event;
1488 if (rec->opts.synth & PERF_SYNTH_TASK) {
1489 bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1491 err = __machine__synthesize_threads(machine, tool, &opts->target,
1492 rec->evlist->core.threads,
1493 f, needs_mmap, opts->sample_address,
1494 rec->opts.nr_threads_synthesize);
1497 if (rec->opts.nr_threads_synthesize > 1)
1498 perf_set_singlethreaded();
1504 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
1506 struct record *rec = data;
1507 pthread_kill(rec->thread_id, SIGUSR2);
1511 static int record__setup_sb_evlist(struct record *rec)
1513 struct record_opts *opts = &rec->opts;
1515 if (rec->sb_evlist != NULL) {
1517 * We get here if --switch-output-event populated the
1518 * sb_evlist, so associate a callback that will send a SIGUSR2
1519 * to the main thread.
1521 evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
1522 rec->thread_id = pthread_self();
1524 #ifdef HAVE_LIBBPF_SUPPORT
1525 if (!opts->no_bpf_event) {
1526 if (rec->sb_evlist == NULL) {
1527 rec->sb_evlist = evlist__new();
1529 if (rec->sb_evlist == NULL) {
1530 pr_err("Couldn't create side band evlist.\n.");
1535 if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
1536 pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
1541 if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
1542 pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1543 opts->no_bpf_event = true;
/*
 * Record reference timestamps for --clockid sessions: store the clockid,
 * its resolution, and a paired wall-clock (gettimeofday) / clockid
 * (clock_gettime) sample in the session header so reports can convert
 * between the two time bases.
 *
 * NOTE(review): this extraction is missing the return statements of the
 * early-exit and error paths; fragments annotated as-is.
 */
1549 static int record__init_clock(struct record *rec)
1551 struct perf_session *session = rec->session;
1552 struct timespec ref_clockid;
1553 struct timeval ref_tod;
/* Nothing to do unless the user asked for a specific clock. */
1556 if (!rec->opts.use_clockid)
1559 if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1560 session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
1562 session->header.env.clock.clockid = rec->opts.clockid;
/* Take the two reference samples back to back to minimize skew. */
1564 if (gettimeofday(&ref_tod, NULL) != 0) {
1565 pr_err("gettimeofday failed, cannot set reference time.\n");
1569 if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
1570 pr_err("clock_gettime failed, cannot set reference time.\n");
/* Wall clock reference in nanoseconds. */
1574 ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
1575 (u64) ref_tod.tv_usec * NSEC_PER_USEC;
1577 session->header.env.clock.tod_ns = ref;
/* Matching clockid reference in nanoseconds. */
1579 ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
1580 (u64) ref_clockid.tv_nsec;
1582 session->header.env.clock.clockid_ns = ref;
1586 static void hit_auxtrace_snapshot_trigger(struct record *rec)
1588 if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
1589 trigger_hit(&auxtrace_snapshot_trigger);
1590 auxtrace_record__snapshot_started = 1;
1591 if (auxtrace_record__snapshot_start(rec->itr))
1592 trigger_error(&auxtrace_snapshot_trigger);
/*
 * On hybrid (big.LITTLE style) systems, prefix each hybrid evsel's name
 * with its PMU ("pmu/event/") so identically named events on different
 * PMUs can be told apart in the output.  No-op on non-hybrid systems.
 *
 * NOTE(review): extraction is missing the asprintf() result check and the
 * replacement of the old name; fragments annotated as-is.
 */
1596 static void record__uniquify_name(struct record *rec)
1599 struct evlist *evlist = rec->evlist;
1603 if (!perf_pmu__has_hybrid())
1606 evlist__for_each_entry(evlist, pos) {
/* Only hybrid events need disambiguation. */
1607 if (!evsel__is_hybrid(pos))
/* A '/' means the name already carries a PMU prefix. */
1610 if (strchr(pos->name, '/'))
1613 ret = asprintf(&new_name, "%s/%s/",
1614 pos->pmu_name, pos->name);
1617 pos->name = new_name;
/*
 * The heart of 'perf record': set up signals, the session, the workload,
 * the ring buffers and side-band machinery, then run the read/poll loop
 * until done, and finally tear everything down and print the summary.
 * Returns 0/child status on success, negative error otherwise.
 *
 * NOTE(review): this extraction is missing many interleaved lines (braces,
 * error checks, gotos, labels); the visible fragments are annotated as-is.
 */
1622 static int __cmd_record(struct record *rec, int argc, const char **argv)
1626 unsigned long waking = 0;
/* Extra argv means we fork and exec a workload to profile. */
1627 const bool forks = argc > 0;
1628 struct perf_tool *tool = &rec->tool;
1629 struct record_opts *opts = &rec->opts;
1630 struct perf_data *data = &rec->data;
1631 struct perf_session *session;
1632 bool disabled = false, draining = false;
1635 enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
/* Signal handling: clean exit on the usual suspects. */
1637 atexit(record__sig_exit);
1638 signal(SIGCHLD, sig_handler);
1639 signal(SIGINT, sig_handler);
1640 signal(SIGTERM, sig_handler);
1641 signal(SIGSEGV, sigsegv_handler);
1643 if (rec->opts.record_namespaces)
1644 tool->namespace_events = true;
1646 if (rec->opts.record_cgroup) {
1647 #ifdef HAVE_FILE_HANDLE
1648 tool->cgroup_events = true;
1650 pr_err("cgroup tracking is not supported\n");
/* SIGUSR2 drives AUX snapshots and/or --switch-output. */
1655 if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1656 signal(SIGUSR2, snapshot_sig_handler);
1657 if (rec->opts.auxtrace_snapshot_mode)
1658 trigger_on(&auxtrace_snapshot_trigger);
1659 if (rec->switch_output.enabled)
1660 trigger_on(&switch_output_trigger);
1662 signal(SIGUSR2, SIG_IGN);
/* Create the output session. */
1665 session = perf_session__new(data, tool);
1666 if (IS_ERR(session)) {
1667 pr_err("Perf session creation failed.\n");
1668 return PTR_ERR(session);
1671 fd = perf_data__fd(data);
1672 rec->session = session;
1674 if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1675 pr_err("Compression initialization failed.\n");
/* Optional eventfd used to wake the poll loop from signal context. */
1678 #ifdef HAVE_EVENTFD_SUPPORT
1679 done_fd = eventfd(0, EFD_NONBLOCK);
1681 pr_err("Failed to create wakeup eventfd, error: %m\n");
1683 goto out_delete_session;
1685 err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
1687 pr_err("Failed to add wakeup eventfd to poll list\n");
1689 goto out_delete_session;
1691 #endif // HAVE_EVENTFD_SUPPORT
1693 session->header.env.comp_type = PERF_COMP_ZSTD;
1694 session->header.env.comp_level = rec->opts.comp_level;
1696 if (rec->opts.kcore &&
1697 !record__kcore_readable(&session->machines.host)) {
1698 pr_err("ERROR: kcore is not readable.\n");
1702 if (record__init_clock(rec))
1705 record__init_features(rec);
/* Fork the workload now (it waits on a pipe until start_workload). */
1708 err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
1709 workload_exec_failed_signal);
1711 pr_err("Couldn't run the workload!\n");
1713 goto out_delete_session;
1718 * If we have just single event and are sending data
1719 * through pipe, we need to force the ids allocation,
1720 * because we synthesize event name through the pipe
1721 * and need the id for that.
1723 if (data->is_pipe && rec->evlist->core.nr_entries == 1)
1724 rec->opts.sample_id = true;
1726 record__uniquify_name(rec);
/* Open counters and mmap the ring buffers. */
1728 if (record__open(rec) != 0) {
1732 session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
1734 if (rec->opts.kcore) {
1735 err = record__kcore_copy(&session->machines.host, data);
1737 pr_err("ERROR: Failed to copy kcore\n");
1742 err = bpf__apply_obj_config();
1744 char errbuf[BUFSIZ];
1746 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1747 pr_err("ERROR: Apply config to BPF failed: %s\n",
1753 * Normally perf_session__new would do this, but it doesn't have the
1756 if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
1757 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n")
1758 rec->tool.ordered_events = false;
1761 if (!rec->evlist->core.nr_groups)
1762 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
/* Write the header (pipe or file flavor). */
1764 if (data->is_pipe) {
1765 err = perf_header__write_pipe(fd);
1769 err = perf_session__write_header(session, rec->evlist, fd, false);
1775 if (!rec->no_buildid
1776 && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1777 pr_err("Couldn't generate buildids. "
1778 "Use --no-buildid to profile anyway.\n");
1782 err = record__setup_sb_evlist(rec);
/* Head-of-file synthetic events (tail == false). */
1786 err = record__synthesize(rec, false);
1790 if (rec->realtime_prio) {
1791 struct sched_param param;
1793 param.sched_priority = rec->realtime_prio;
/* NOTE(review): "¶m" below is mojibake for "&param" - fix encoding upstream. */
1794 if (sched_setscheduler(0, SCHED_FIFO, ¶m)) {
1795 pr_err("Could not set realtime priority.\n");
1802 * When perf is starting the traced process, all the events
1803 * (apart from group members) have enable_on_exec=1 set,
1804 * so don't spoil it by prematurely enabling them.
1806 if (!target__none(&opts->target) && !opts->initial_delay)
1807 evlist__enable(rec->evlist);
/* Forked-workload path: synthesize COMM/NAMESPACES for the child. */
1813 struct machine *machine = &session->machines.host;
1814 union perf_event *event;
1817 event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1818 if (event == NULL) {
1824 * Some H/W events are generated before COMM event
1825 * which is emitted during exec(), so perf script
1826 * cannot see a correct process name for those events.
1827 * Synthesize COMM event to prevent it.
1829 tgid = perf_event__synthesize_comm(tool, event,
1830 rec->evlist->workload.pid,
1831 process_synthesized_event,
1838 event = malloc(sizeof(event->namespaces) +
1839 (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1840 machine->id_hdr_size);
1841 if (event == NULL) {
1847 * Synthesize NAMESPACES event for the command specified.
1849 perf_event__synthesize_namespaces(tool, event,
1850 rec->evlist->workload.pid,
1851 tgid, process_synthesized_event,
/* Release the workload from its pipe-wait and let it exec(). */
1855 evlist__start_workload(rec->evlist);
1858 if (evlist__initialize_ctlfd(rec->evlist, opts->ctl_fd, opts->ctl_fd_ack))
/* --delay: start disabled, enable after the given ms (if positive). */
1861 if (opts->initial_delay) {
1862 pr_info(EVLIST_DISABLED_MSG);
1863 if (opts->initial_delay > 0) {
1864 usleep(opts->initial_delay * USEC_PER_MSEC);
1865 evlist__enable(rec->evlist);
1866 pr_info(EVLIST_ENABLED_MSG);
1870 trigger_ready(&auxtrace_snapshot_trigger);
1871 trigger_ready(&switch_output_trigger);
1872 perf_hooks__invoke_record_start();
/* ---- main read/poll loop ---- */
1874 unsigned long long hits = rec->samples;
1877 * rec->evlist->bkw_mmap_state is possible to be
1878 * BKW_MMAP_EMPTY here: when done == true and
1879 * hits != rec->samples in previous round.
1881 * evlist__toggle_bkw_mmap ensure we never
1882 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1884 if (trigger_is_hit(&switch_output_trigger) || done || draining)
1885 evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1887 if (record__mmap_read_all(rec, false) < 0) {
1888 trigger_error(&auxtrace_snapshot_trigger);
1889 trigger_error(&switch_output_trigger);
/* Handle an in-flight AUX snapshot request. */
1894 if (auxtrace_record__snapshot_started) {
1895 auxtrace_record__snapshot_started = 0;
1896 if (!trigger_is_error(&auxtrace_snapshot_trigger))
1897 record__read_auxtrace_snapshot(rec, false);
1898 if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1899 pr_err("AUX area tracing snapshot failed\n");
/* Handle --switch-output: rotate to a new perf.data file. */
1905 if (trigger_is_hit(&switch_output_trigger)) {
1907 * If switch_output_trigger is hit, the data in
1908 * overwritable ring buffer should have been collected,
1909 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1911 * If SIGUSR2 raise after or during record__mmap_read_all(),
1912 * record__mmap_read_all() didn't collect data from
1913 * overwritable ring buffer. Read again.
1915 if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1917 trigger_ready(&switch_output_trigger);
1920 * Reenable events in overwrite ring buffer after
1921 * record__mmap_read_all(): we should have collected
1924 evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1927 fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1930 fd = record__switch_output(rec, false);
1932 pr_err("Failed to switch to new file\n");
1933 trigger_error(&switch_output_trigger);
1938 /* re-arm the alarm */
1939 if (rec->switch_output.time)
1940 alarm(rec->switch_output.time);
/* No new samples this round: block in poll unless finishing. */
1943 if (hits == rec->samples) {
1944 if (done || draining)
1946 err = evlist__poll(rec->evlist, -1);
1948 * Propagate error, only if there's any. Ignore positive
1949 * number of returned events and interrupt error.
1951 if (err > 0 || (err < 0 && errno == EINTR))
/* All fds hung up: drain remaining data then stop. */
1955 if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
/* Commands arriving on the --control fd. */
1959 if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
1961 case EVLIST_CTL_CMD_SNAPSHOT:
1962 hit_auxtrace_snapshot_trigger(rec);
1963 evlist__ctlfd_ack(rec->evlist);
1965 case EVLIST_CTL_CMD_STOP:
1968 case EVLIST_CTL_CMD_ACK:
1969 case EVLIST_CTL_CMD_UNSUPPORTED:
1970 case EVLIST_CTL_CMD_ENABLE:
1971 case EVLIST_CTL_CMD_DISABLE:
1972 case EVLIST_CTL_CMD_EVLIST:
1973 case EVLIST_CTL_CMD_PING:
1980 * When perf is starting the traced process, at the end events
1981 * die with the process and we wait for that. Thus no need to
1982 * disable events in this case.
1984 if (done && !disabled && !target__none(&opts->target)) {
1985 trigger_off(&auxtrace_snapshot_trigger);
1986 evlist__disable(rec->evlist);
/* ---- teardown ---- */
1991 trigger_off(&auxtrace_snapshot_trigger);
1992 trigger_off(&switch_output_trigger);
1994 if (opts->auxtrace_snapshot_on_exit)
1995 record__auxtrace_snapshot_exit(rec);
1997 if (forks && workload_exec_errno) {
1998 char msg[STRERR_BUFSIZE], strevsels[2048];
1999 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2001 evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2003 pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2004 strevsels, argv[0], emsg);
2010 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
/* Tail-of-file synthesis of the workload when profiling everything. */
2012 if (target__none(&rec->opts.target))
2013 record__synthesize_workload(rec, true);
2016 evlist__finalize_ctlfd(rec->evlist);
/* Final drain of the ring buffers and any async (AIO) writes. */
2017 record__mmap_read_all(rec, true);
2018 record__aio_mmap_read_sync(rec);
2020 if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2021 ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2022 session->header.env.comp_ratio = ratio + 0.5;
/* Reap the forked workload and derive our exit status from it. */
2028 if (!child_finished)
2029 kill(rec->evlist->workload.pid, SIGTERM);
2035 else if (WIFEXITED(exit_status))
2036 status = WEXITSTATUS(exit_status);
2037 else if (WIFSIGNALED(exit_status))
2038 signr = WTERMSIG(exit_status);
2042 record__synthesize(rec, true);
2043 /* this will be recalculated during process_buildids() */
2047 if (!rec->timestamp_filename) {
2048 record__finish_output(rec);
2050 fd = record__switch_output(rec, true);
2053 goto out_delete_session;
2058 perf_hooks__invoke_record_end();
/* Summary line unless -q. */
2060 if (!err && !quiet) {
2062 const char *postfix = rec->timestamp_filename ?
2063 ".<timestamp>" : "";
2065 if (rec->samples && !rec->opts.full_auxtrace)
2066 scnprintf(samples, sizeof(samples),
2067 " (%" PRIu64 " samples)", rec->samples);
2071 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
2072 perf_data__size(data) / 1024.0 / 1024.0,
2073 data->path, postfix, samples);
2075 fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
2076 rec->session->bytes_transferred / 1024.0 / 1024.0,
2079 fprintf(stderr, " ]\n");
2083 #ifdef HAVE_EVENTFD_SUPPORT
2087 zstd_fini(&session->zstd_data);
2088 perf_session__delete(session);
2090 if (!opts->no_bpf_event)
2091 evlist__stop_sb_thread(rec->sb_evlist);
2095 static void callchain_debug(struct callchain_param *callchain)
2097 static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2099 pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2101 if (callchain->record_mode == CALLCHAIN_DWARF)
2102 pr_debug("callchain: stack dump size %d\n",
2103 callchain->dump_size);
/*
 * Parse a --call-graph argument into *callchain and adjust record opts
 * accordingly; unset corresponds to --no-call-graph.
 *
 * NOTE(review): extraction is missing the unset branch's braces/return and
 * the success check around parse_callchain_record_opt(); fragments as-is.
 */
2106 int record_opts__parse_callchain(struct record_opts *record,
2107 struct callchain_param *callchain,
2108 const char *arg, bool unset)
2111 callchain->enabled = !unset;
2113 /* --no-call-graph */
2115 callchain->record_mode = CALLCHAIN_NONE;
2116 pr_debug("callchain: disabled\n");
2120 ret = parse_callchain_record_opt(arg, callchain);
2122 /* Enable data address sampling for DWARF unwind. */
2123 if (callchain->record_mode == CALLCHAIN_DWARF)
2124 record->sample_address = true;
2125 callchain_debug(callchain);
2131 int record_parse_callchain_opt(const struct option *opt,
2135 return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2138 int record_callchain_opt(const struct option *opt,
2139 const char *arg __maybe_unused,
2140 int unset __maybe_unused)
2142 struct callchain_param *callchain = opt->value;
2144 callchain->enabled = true;
2146 if (callchain->record_mode == CALLCHAIN_NONE)
2147 callchain->record_mode = CALLCHAIN_FP;
2149 callchain_debug(callchain);
2153 static int perf_record_config(const char *var, const char *value, void *cb)
2155 struct record *rec = cb;
2157 if (!strcmp(var, "record.build-id")) {
2158 if (!strcmp(value, "cache"))
2159 rec->no_buildid_cache = false;
2160 else if (!strcmp(value, "no-cache"))
2161 rec->no_buildid_cache = true;
2162 else if (!strcmp(value, "skip"))
2163 rec->no_buildid = true;
2164 else if (!strcmp(value, "mmap"))
2165 rec->buildid_mmap = true;
2170 if (!strcmp(var, "record.call-graph")) {
2171 var = "call-graph.record-mode";
2172 return perf_default_config(var, value, cb);
2174 #ifdef HAVE_AIO_SUPPORT
2175 if (!strcmp(var, "record.aio")) {
2176 rec->opts.nr_cblocks = strtol(value, NULL, 0);
2177 if (!rec->opts.nr_cblocks)
2178 rec->opts.nr_cblocks = nr_cblocks_default;
2181 if (!strcmp(var, "record.debuginfod")) {
2182 rec->debuginfod.urls = strdup(value);
2183 if (!rec->debuginfod.urls)
2185 rec->debuginfod.set = true;
2192 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2194 struct record_opts *opts = (struct record_opts *)opt->value;
2199 if (!strcasecmp(str, "node"))
2200 opts->affinity = PERF_AFFINITY_NODE;
2201 else if (!strcasecmp(str, "cpu"))
2202 opts->affinity = PERF_AFFINITY_CPU;
/*
 * Option callback for --max-size: parse a size with a B/K/M/G suffix into
 * the output_max_size limit.
 *
 * NOTE(review): extraction is missing the unset branch, the array
 * terminator and the final returns; fragments annotated as-is.
 */
2207 static int parse_output_max_size(const struct option *opt,
2208 const char *str, int unset)
2210 unsigned long *s = (unsigned long *)opt->value;
/* Suffix multiplier table for parse_tag_value(). */
2211 static struct parse_tag tags_size[] = {
2212 { .tag = 'B', .mult = 1 },
2213 { .tag = 'K', .mult = 1 << 10 },
2214 { .tag = 'M', .mult = 1 << 20 },
2215 { .tag = 'G', .mult = 1 << 30 },
/* parse_tag_value() returns (unsigned long)-1 on parse failure. */
2225 val = parse_tag_value(str, tags_size);
2226 if (val != (unsigned long) -1) {
/*
 * Option callback for -m/--mmap-pages: "pages[,pages]" - the first value
 * sizes the data mmaps, the optional second one (after ',') sizes the
 * AUX area tracing mmaps.
 *
 * NOTE(review): extraction is missing the strdup/comma-split/free logic
 * between the two parse calls; fragments annotated as-is.
 */
2234 static int record__parse_mmap_pages(const struct option *opt,
2236 int unset __maybe_unused)
2238 struct record_opts *opts = opt->value;
2240 unsigned int mmap_pages;
/* First component: data mmap size. */
2255 ret = __evlist__parse_mmap_pages(&mmap_pages, s);
2258 opts->mmap_pages = mmap_pages;
/* Second component (after ','): AUX area mmap size. */
2266 ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
2270 opts->auxtrace_mmap_pages = mmap_pages;
2277 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
2281 static int parse_control_option(const struct option *opt,
2283 int unset __maybe_unused)
2285 struct record_opts *opts = opt->value;
2287 return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
/*
 * Warn when the --switch-output size threshold is smaller than one kernel
 * wakeup's worth of ring-buffer data: files would then overshoot the
 * requested size.
 */
2290 static void switch_output_size_warn(struct record *rec)
2292 u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
2293 struct switch_output *s = &rec->switch_output;
2297 if (s->size < wakeup_size) {
2300 unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
2301 pr_warning("WARNING: switch-output data size lower than "
2302 "wakeup kernel buffer size (%s) "
2303 "expect bigger perf.data sizes\n", buf);
/*
 * Interpret the --switch-output argument: "signal", a size ("100M") or a
 * time ("30s"/"5m"/...). Any enabled mode implies timestamped output file
 * names.
 *
 * NOTE(review): extraction is missing the array terminators, the enabled
 * flags, the error return and the 'enabled:' label; fragments as-is.
 */
2307 static int switch_output_setup(struct record *rec)
2309 struct switch_output *s = &rec->switch_output;
/* B/K/M/G suffixes for the size form. */
2310 static struct parse_tag tags_size[] = {
2311 { .tag = 'B', .mult = 1 },
2312 { .tag = 'K', .mult = 1 << 10 },
2313 { .tag = 'M', .mult = 1 << 20 },
2314 { .tag = 'G', .mult = 1 << 30 },
/* s/m/h/d suffixes (in seconds) for the time form. */
2317 static struct parse_tag tags_time[] = {
2318 { .tag = 's', .mult = 1 },
2319 { .tag = 'm', .mult = 60 },
2320 { .tag = 'h', .mult = 60*60 },
2321 { .tag = 'd', .mult = 60*60*24 },
2327 * If we're using --switch-output-events, then we imply its
2328 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
2329 * thread to its parent.
2331 if (rec->switch_output_event_set)
/* Explicit "signal" keyword: rotate on SIGUSR2 only. */
2337 if (!strcmp(s->str, "signal")) {
2340 pr_debug("switch-output with SIGUSR2 signal\n");
/* Try the size form next; -1 means "did not parse as a size". */
2344 val = parse_tag_value(s->str, tags_size);
2345 if (val != (unsigned long) -1) {
2347 pr_debug("switch-output with %s size threshold\n", s->str);
/* Finally the time form. */
2351 val = parse_tag_value(s->str, tags_time);
2352 if (val != (unsigned long) -1) {
2354 pr_debug("switch-output with %s time threshold (%lu seconds)\n",
/* Any switch-output mode forces timestamped output names. */
2362 rec->timestamp_filename = true;
2365 if (s->size && !rec->opts.no_buffering)
2366 switch_output_size_warn(rec);
/* Usage strings shown by 'perf record -h'; exported via record_usage for
 * builtin-script's re-use of the record options. */
2371 static const char * const __record_usage[] = {
2372 "perf record [<options>] [<command>]",
2373 "perf record [<options>] -- <command> [<options>]",
2376 const char * const *record_usage = __record_usage;
2378 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
2379 struct perf_sample *sample, struct machine *machine)
2382 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
2383 * no need to add them twice.
2385 if (!(event->header.misc & PERF_RECORD_MISC_USER))
2387 return perf_event__process_mmap(tool, event, sample, machine);
2390 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
2391 struct perf_sample *sample, struct machine *machine)
2394 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
2395 * no need to add them twice.
2397 if (!(event->header.misc & PERF_RECORD_MISC_USER))
2400 return perf_event__process_mmap2(tool, event, sample, machine);
2403 static int process_timestamp_boundary(struct perf_tool *tool,
2404 union perf_event *event __maybe_unused,
2405 struct perf_sample *sample,
2406 struct machine *machine __maybe_unused)
2408 struct record *rec = container_of(tool, struct record, tool);
2410 set_timestamp_boundary(rec, sample->time);
/*
 * Option callback for --synth: parse the no|all|task|mmap|cgroup mask.
 * Works on a strdup'ed copy because parse_synth_opt() tokenizes in place.
 *
 * NOTE(review): extraction is missing the NULL check for strdup, free(p)
 * and the return statements; fragments annotated as-is.
 */
2414 static int parse_record_synth_option(const struct option *opt,
2416 int unset __maybe_unused)
2418 struct record_opts *opts = opt->value;
2419 char *p = strdup(str);
/* Negative result signals an unrecognized synth keyword. */
2424 opts->synth = parse_synth_opt(p);
2427 if (opts->synth < 0) {
2428 pr_err("Invalid synth option: %s\n", str);
2435 * XXX Ideally would be local to cmd_record() and passed to a record__new
2436 * because we need to have access to it in record__exit, that is called
2437 * after cmd_record() exits, but since record_options need to be accessible to
2438 * builtin-script, leave it here.
2440 * At least we don't touch it in all the other functions here directly.
2442 * Just say no to tons of global variables, sigh.
/* Global record state: default option values plus the perf_tool callbacks
 * used while writing the perf.data stream. */
2444 static struct record record = {
2446 .sample_time = true,
/* UINT_MAX/ULLONG_MAX mean "not set by the user". */
2447 .mmap_pages = UINT_MAX,
2448 .user_freq = UINT_MAX,
2449 .user_interval = ULLONG_MAX,
2453 .default_per_cpu = true,
2455 .mmap_flush = MMAP_FLUSH_DEFAULT,
2456 .nr_threads_synthesize = 1,
2459 .synth = PERF_SYNTH_ALL,
/* perf_tool callbacks driving build-id collection and time boundaries. */
2462 .sample = process_sample_event,
2463 .fork = perf_event__process_fork,
2464 .exit = perf_event__process_exit,
2465 .comm = perf_event__process_comm,
2466 .namespaces = perf_event__process_namespaces,
2467 .mmap = build_id__process_mmap,
2468 .mmap2 = build_id__process_mmap2,
2469 .itrace_start = process_timestamp_boundary,
2470 .aux = process_timestamp_boundary,
2471 .ordered_events = true,
/* Help text for --call-graph, shared with builtin-script. */
2475 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2476 "\n\t\t\t\tDefault: fp";
/* --dry-run: parse options, then exit without recording. */
2478 static bool dry_run;
2481 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
2482 * with it and switch to use the library functions in perf_evlist that came
2483 * from builtin-record.c, i.e. use record_opts,
2484 * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
2487 static struct option __record_options[] = {
2488 OPT_CALLBACK('e', "event", &record.evlist, "event",
2489 "event selector. use 'perf list' to list available events",
2490 parse_events_option),
2491 OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2492 "event filter", parse_filter),
2493 OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2494 NULL, "don't record events from perf itself",
2496 OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2497 "record events on existing process id"),
2498 OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2499 "record events on existing thread id"),
2500 OPT_INTEGER('r', "realtime", &record.realtime_prio,
2501 "collect data with this RT SCHED_FIFO priority"),
2502 OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2503 "collect data without buffering"),
2504 OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2505 "collect raw sample records from all opened counters"),
2506 OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2507 "system-wide collection from all CPUs"),
2508 OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2509 "list of cpus to monitor"),
2510 OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2511 OPT_STRING('o', "output", &record.data.path, "file",
2512 "output file name"),
2513 OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2514 &record.opts.no_inherit_set,
2515 "child tasks do not inherit counters"),
2516 OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2517 "synthesize non-sample events at the end of output"),
2518 OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2519 OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
2520 OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2521 "Fail if the specified frequency can't be used"),
2522 OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2523 "profile at this frequency",
2524 record__parse_freq),
2525 OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2526 "number of mmap data pages and AUX area tracing mmap pages",
2527 record__parse_mmap_pages),
2528 OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2529 "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
2530 record__mmap_flush_parse),
2531 OPT_BOOLEAN(0, "group", &record.opts.group,
2532 "put the counters into a counter group"),
2533 OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2534 NULL, "enables call-graph recording" ,
2535 &record_callchain_opt),
2536 OPT_CALLBACK(0, "call-graph", &record.opts,
2537 "record_mode[,record_size]", record_callchain_help,
2538 &record_parse_callchain_opt),
2539 OPT_INCR('v', "verbose", &verbose,
2540 "be more verbose (show counter open errors, etc)"),
2541 OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2542 OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2543 "per thread counts"),
2544 OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2545 OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2546 "Record the sample physical addresses"),
2547 OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
2548 "Record the sampled data address data page size"),
2549 OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
2550 "Record the sampled code address (ip) page size"),
2551 OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2552 OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2553 &record.opts.sample_time_set,
2554 "Record the sample timestamps"),
2555 OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2556 "Record the sample period"),
2557 OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2559 OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2560 &record.no_buildid_cache_set,
2561 "do not update the buildid cache"),
2562 OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2563 &record.no_buildid_set,
2564 "do not collect buildids in perf.data"),
2565 OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2566 "monitor event in cgroup name only",
2568 OPT_INTEGER('D', "delay", &record.opts.initial_delay,
2569 "ms to wait before starting measurement after program start (-1: start with events disabled)"),
2570 OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
2571 OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2574 OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2575 "branch any", "sample any taken branches",
2576 parse_branch_stack),
2578 OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2579 "branch filter mask", "branch stack filter modes",
2580 parse_branch_stack),
2581 OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2582 "sample by weight (on special events only)"),
2583 OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2584 "sample transaction flags (special events only)"),
2585 OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2586 "use per-thread mmaps"),
2587 OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2588 "sample selected machine registers on interrupt,"
2589 " use '-I?' to list register names", parse_intr_regs),
2590 OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2591 "sample selected machine registers on interrupt,"
2592 " use '--user-regs=?' to list register names", parse_user_regs),
2593 OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2594 "Record running/enabled time of read (:S) events"),
2595 OPT_CALLBACK('k', "clockid", &record.opts,
2596 "clockid", "clockid to use for events, see clock_gettime()",
2598 OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2599 "opts", "AUX area tracing Snapshot Mode", ""),
2600 OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
2601 "opts", "sample AUX area", ""),
2602 OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2603 "per thread proc mmap processing timeout in ms"),
2604 OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2605 "Record namespaces events"),
2606 OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
2607 "Record cgroup events"),
2608 OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
2609 &record.opts.record_switch_events_set,
2610 "Record context switch events"),
2611 OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2612 "Configure all used events to run in kernel space.",
2613 PARSE_OPT_EXCLUSIVE),
2614 OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2615 "Configure all used events to run in user space.",
2616 PARSE_OPT_EXCLUSIVE),
2617 OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2618 "collect kernel callchains"),
2619 OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2620 "collect user callchains"),
2621 OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2622 "clang binary to use for compiling BPF scriptlets"),
2623 OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2624 "options passed to clang when compiling BPF scriptlets"),
2625 OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2626 "file", "vmlinux pathname"),
2627 OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2628 "Record build-id of all DSOs regardless of hits"),
2629 OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
2630 "Record build-id in map events"),
2631 OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2632 "append timestamp to output filename"),
2633 OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2634 "Record timestamp boundary (time of first/last samples)"),
2635 OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2636 &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2637 "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
2639 OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event",
2640 "switch output event selector. use 'perf list' to list available events",
2641 parse_events_option_new_evlist),
2642 OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2643 "Limit number of switch output generated files"),
2644 OPT_BOOLEAN(0, "dry-run", &dry_run,
2645 "Parse options then exit"),
2646 #ifdef HAVE_AIO_SUPPORT
2647 OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2648 &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2651 OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2652 "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
2653 record__parse_affinity),
2654 #ifdef HAVE_ZSTD_SUPPORT
2655 OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2656 "n", "Compressed records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
2657 record__parse_comp_level),
2659 OPT_CALLBACK(0, "max-size", &record.output_max_size,
2660 "size", "Limit the maximum size of the output file", parse_output_max_size),
2661 OPT_UINTEGER(0, "num-thread-synthesize",
2662 &record.opts.nr_threads_synthesize,
2663 "number of threads to run for event synthesis"),
2665 OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
2666 "libpfm4 event selector. use 'perf list' to list available events",
2667 parse_libpfm_events_option),
2669 OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
2670 "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
2671 "\t\t\t 'snapshot': AUX area tracing snapshot).\n"
2672 "\t\t\t Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
2673 "\t\t\t Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
2674 parse_control_option),
2675 OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
2676 "Fine-tune event synthesis: default=all", parse_record_synth_option),
2677 OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
2678 &record.debuginfod.set, "debuginfod urls",
2679 "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
/*
 * Mutable pointer to the option table above; cmd_record() passes it to
 * parse_options()/usage helpers and set_option_nobuild() patches entries
 * through it when a feature was compiled out.
 */
2684 struct option *record_options = __record_options;
/*
 * cmd_record - entry point for the 'perf record' subcommand.
 *
 * Parses the command line, validates the combination of target, event,
 * build-id and output settings, allocates the event list and auxiliary
 * state, then delegates the actual recording to __cmd_record() and
 * releases the resources it allocated.
 */
2686 int cmd_record(int argc, const char **argv)
2689 struct record *rec = &record;	/* the single, file-scope record instance */
2690 char errbuf[BUFSIZ];		/* scratch buffer for target__strerror() */
2692 setlocale(LC_ALL, "");
/*
 * Builds without libbpf: stub out the BPF-related options so they
 * report why they are unavailable instead of silently failing.
 */
2694 #ifndef HAVE_LIBBPF_SUPPORT
2695 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2696 set_nobuild('\0', "clang-path", true);
2697 set_nobuild('\0', "clang-opt", true);
/* Same treatment for --vmlinux when BPF prologue support is missing. */
2701 #ifndef HAVE_BPF_PROLOGUE
2702 # if !defined (HAVE_DWARF_SUPPORT)
2703 # define REASON "NO_DWARF=1"
2704 # elif !defined (HAVE_LIBBPF_SUPPORT)
2705 # define REASON "NO_LIBBPF=1"
2707 # define REASON "this architecture doesn't support BPF prologue"
2709 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2710 set_nobuild('\0', "vmlinux", true);
/* Default affinity mode; may be overridden by --affinity below. */
2715 rec->opts.affinity = PERF_AFFINITY_SYS;
2717 rec->evlist = evlist__new();
2718 if (rec->evlist == NULL)
/* Apply perfconfig file settings before command line options. */
2721 err = perf_config(perf_record_config, rec);
2725 argc = parse_options(argc, argv, record_options, record_usage,
2726 PARSE_OPT_STOP_AT_NON_OPTION);
2728 perf_quiet_option();
2730 err = symbol__validate_sym_arguments();
2734 perf_debuginfod_setup(&record.debuginfod);
2736 /* Make system wide (-a) the default target. */
2737 if (!argc && target__none(&rec->opts.target))
2738 rec->opts.target.system_wide = true;
/* --cgroup (-G) is meaningful only together with system-wide mode. */
2740 if (nr_cgroups && !rec->opts.target.system_wide) {
2741 usage_with_options_msg(record_usage, record_options,
2742 "cgroup monitoring only available in system-wide mode");
/*
 * --buildid-mmap: carry build ids in mmap2 events instead of the
 * build-id cache; requires kernel support (perf_event_attr::build_id).
 */
2746 if (rec->buildid_mmap) {
2747 if (!perf_can_record_build_id()) {
2748 pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
2752 pr_debug("Enabling build id in mmap2 events.\n");
2753 /* Enable mmap build id synthesizing. */
2754 symbol_conf.buildid_mmap2 = true;
2755 /* Enable perf_event_attr::build_id bit. */
2756 rec->opts.build_id = true;
2757 /* Disable build id cache. */
2758 rec->no_buildid = true;
2761 if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
2762 pr_err("Kernel has no cgroup sampling support.\n");
/* --kcore output is a directory (perf.data/), not a single file. */
2767 if (rec->opts.kcore)
2768 rec->data.is_dir = true;
/* Compressed output skips the end-of-session build-id pass. */
2770 if (rec->opts.comp_level != 0) {
2771 pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2772 rec->no_buildid = true;
2775 if (rec->opts.record_switch_events &&
2776 !perf_can_record_switch_events()) {
2777 ui__error("kernel does not support recording context switch events\n");
2778 parse_options_usage(record_usage, record_options, "switch-events", 0);
/* Parse the --switch-output argument (signal, size or time threshold). */
2783 if (switch_output_setup(rec)) {
2784 parse_options_usage(record_usage, record_options, "switch-output", 0);
/* Time-based switch-output: rotate output on SIGALRM. */
2789 if (rec->switch_output.time) {
2790 signal(SIGALRM, alarm_sig_handler);
2791 alarm(rec->switch_output.time);
2794 if (rec->switch_output.num_files) {
/*
 * NOTE(review): calloc() takes (nmemb, size); the arguments here are
 * in the reverse order. Harmless since the product is the same, but
 * unconventional — consider swapping to calloc(num_files, sizeof(char *)).
 */
2795 rec->switch_output.filenames = calloc(sizeof(char *),
2796 rec->switch_output.num_files);
2797 if (!rec->switch_output.filenames) {
2804 * Allow aliases to facilitate the lookup of symbols for address
2805 * filters. Refer to auxtrace_parse_filters().
2807 symbol_conf.allow_aliases = true;
/* Non-default --affinity: allocate the CPU mask used by the reader thread. */
2811 if (rec->opts.affinity != PERF_AFFINITY_SYS) {
2812 rec->affinity_mask.nbits = cpu__max_cpu().cpu;
2813 rec->affinity_mask.bits = bitmap_zalloc(rec->affinity_mask.nbits);
2814 if (!rec->affinity_mask.bits) {
2815 pr_err("Failed to allocate thread mask for %zd cpus\n", rec->affinity_mask.nbits);
2819 pr_debug2("thread mask[%zd]: empty\n", rec->affinity_mask.nbits);
2822 err = record__auxtrace_init(rec);
/* Route BPF scriptlet output through the perf event stream. */
2829 err = bpf__setup_stdout(rec->evlist);
2831 bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2832 pr_err("ERROR: Setup BPF stdout failed: %s\n",
2839 if (rec->no_buildid_cache || rec->no_buildid) {
2840 disable_buildid_cache();
2841 } else if (rec->switch_output.enabled) {
2843 * In 'perf record --switch-output', disable buildid
2844 * generation by default to reduce data file switching
2845 * overhead. Still generate buildid if they are required
2848 * perf record --switch-output --no-no-buildid \
2849 * --no-no-buildid-cache
2851 * Following code equals to:
2853 * if ((rec->no_buildid || !rec->no_buildid_set) &&
2854 * (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2855 * disable_buildid_cache();
2857 bool disable = true;
2859 if (rec->no_buildid_set && !rec->no_buildid)
2861 if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2864 rec->no_buildid = true;
2865 rec->no_buildid_cache = true;
2866 disable_buildid_cache();
/* --overwrite implies synthesizing non-sample events at the end. */
2870 if (record.opts.overwrite)
2871 record.opts.tail_synthesize = true;
/* No events given on the command line: fall back to the default event(s). */
2873 if (rec->evlist->core.nr_entries == 0) {
2874 if (perf_pmu__has_hybrid()) {
2875 err = evlist__add_default_hybrid(rec->evlist,
2876 !record.opts.no_samples);
2878 err = __evlist__add_default(rec->evlist,
2879 !record.opts.no_samples);
2883 pr_err("Not enough memory for event selector list\n");
/* Per-thread (-t) targets default to not inheriting across forks. */
2888 if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2889 rec->opts.no_inherit = true;
2891 err = target__validate(&rec->opts.target);
2893 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2894 ui__warning("%s\n", errbuf);
2897 err = target__parse_uid(&rec->opts.target);
2899 int saved_errno = errno;	/* preserved; later calls may clobber errno */
2901 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2902 ui__error("%s", errbuf);
2908 /* Enable ignoring missing threads when -u/-p option is defined. */
2909 rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
/* Hybrid (big.LITTLE-style) systems: restrict events to the given CPU list. */
2911 if (evlist__fix_hybrid_cpus(rec->evlist, rec->opts.target.cpu_list)) {
2912 pr_err("failed to use cpu list %s\n",
2913 rec->opts.target.cpu_list);
2917 rec->opts.target.hybrid = perf_pmu__has_hybrid();
/* Frame-pointer callchains may need an arch-specific leaf-frame register. */
2919 if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
2920 arch__add_leaf_frame_record_opts(&rec->opts);
2923 if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2924 usage_with_options(record_usage, record_options);
2926 err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2931 * We take all buildids when the file contains
2932 * AUX area tracing data because we do not decode the
2933 * trace because it would take too long.
2935 if (rec->opts.full_auxtrace)
2936 rec->buildid_all = true;
2938 if (rec->opts.text_poke) {
2939 err = record__config_text_poke(rec->evlist);
2941 pr_err("record__config_text_poke failed, error %d\n", err);
2946 if (record_opts__config(&rec->opts)) {
/* Clamp AIO control blocks and compression level to their supported maxima. */
2951 if (rec->opts.nr_cblocks > nr_cblocks_max)
2952 rec->opts.nr_cblocks = nr_cblocks_max;
2953 pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2955 pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2956 pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2958 if (rec->opts.comp_level > comp_level_max)
2959 rec->opts.comp_level = comp_level_max;
2960 pr_debug("comp level: %d\n", rec->opts.comp_level);
/* All options validated and applied: do the actual recording. */
2962 err = __cmd_record(&record, argc, argv);
/* Common cleanup path: release everything allocated above. */
2964 bitmap_free(rec->affinity_mask.bits);
2965 evlist__delete(rec->evlist);
2967 auxtrace_record__free(rec->itr);
2969 evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
/*
 * Signal handler for AUX area tracing snapshots: fires the snapshot
 * trigger and, when --switch-output was configured in signal mode,
 * also arms the switch-output trigger so the main loop rotates the
 * output file. (The signal it is installed for is not visible in
 * this chunk — presumably SIGUSR2 per the --switch-output help text.)
 */
2973 static void snapshot_sig_handler(int sig __maybe_unused)
2975 struct record *rec = &record;
2977 hit_auxtrace_snapshot_trigger(rec);
2979 if (switch_output_signal(rec))
2980 trigger_hit(&switch_output_trigger);
/*
 * SIGALRM handler installed by cmd_record() when --switch-output uses a
 * time threshold: arms the switch-output trigger so the main loop
 * rotates the output file on the next pass.
 */
2983 static void alarm_sig_handler(int sig __maybe_unused)
2985 struct record *rec = &record;
2987 if (switch_output_time(rec))
2988 trigger_hit(&switch_output_trigger);