4 * Builtin 'trace' command:
6 * Display a continuously updated trace of any workload, CPU, specific PID,
7 * system wide, etc. Default format is loosely strace like, but any other
8 * event may be specified using --event.
10 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
12 * Initially based on the 'trace' prototype by Thomas Gleixner:
14 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
16 * Released under the GPL v2. (and only v2, not any later version)
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
22 #include "util/cgroup.h"
23 #include "util/color.h"
24 #include "util/debug.h"
26 #include "util/event.h"
27 #include "util/evlist.h"
28 #include <subcmd/exec-cmd.h>
29 #include "util/machine.h"
30 #include "util/path.h"
31 #include "util/session.h"
32 #include "util/thread.h"
33 #include <subcmd/parse-options.h>
34 #include "util/strlist.h"
35 #include "util/intlist.h"
36 #include "util/thread_map.h"
37 #include "util/stat.h"
38 #include "trace/beauty/beauty.h"
39 #include "trace-event.h"
40 #include "util/parse-events.h"
41 #include "util/bpf-loader.h"
42 #include "callchain.h"
43 #include "print_binary.h"
45 #include "syscalltbl.h"
46 #include "rb_resort.h"
54 #include <linux/err.h>
55 #include <linux/filter.h>
56 #include <linux/kernel.h>
57 #include <linux/random.h>
58 #include <linux/stringify.h>
59 #include <linux/time64.h>
62 #include "sane_ctype.h"
65 # define O_CLOEXEC 02000000
68 #ifndef F_LINUX_SPECIFIC_BASE
69 # define F_LINUX_SPECIFIC_BASE 1024
73 struct perf_tool tool;
74 struct syscalltbl *sctbl;
77 struct syscall *table;
79 struct perf_evsel *sys_enter,
83 struct record_opts opts;
84 struct perf_evlist *evlist;
86 struct thread *current;
87 struct cgroup *cgroup;
90 unsigned long nr_events;
91 struct strlist *ev_qualifier;
100 double duration_filter;
106 unsigned int max_stack;
107 unsigned int min_stack;
108 bool not_ev_qualifier;
112 bool multiple_threads;
118 bool show_tool_stats;
120 bool kernel_syscallchains;
129 u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
130 void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
134 #define TP_UINT_FIELD(bits) \
135 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
138 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
147 #define TP_UINT_FIELD__SWAPPED(bits) \
148 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
151 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
152 return bswap_##bits(value);\
155 TP_UINT_FIELD__SWAPPED(16);
156 TP_UINT_FIELD__SWAPPED(32);
157 TP_UINT_FIELD__SWAPPED(64);
159 static int tp_field__init_uint(struct tp_field *field,
160 struct format_field *format_field,
163 field->offset = format_field->offset;
165 switch (format_field->size) {
167 field->integer = tp_field__u8;
170 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
173 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
176 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
185 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
187 return sample->raw_data + field->offset;
190 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
192 field->offset = format_field->offset;
193 field->pointer = tp_field__ptr;
200 struct tp_field args, ret;
204 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
205 struct tp_field *field,
208 struct format_field *format_field = perf_evsel__field(evsel, name);
210 if (format_field == NULL)
213 return tp_field__init_uint(field, format_field, evsel->needs_swap);
216 #define perf_evsel__init_sc_tp_uint_field(evsel, name) \
217 ({ struct syscall_tp *sc = evsel->priv;\
218 perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
220 static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
221 struct tp_field *field,
224 struct format_field *format_field = perf_evsel__field(evsel, name);
226 if (format_field == NULL)
229 return tp_field__init_ptr(field, format_field);
232 #define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
233 ({ struct syscall_tp *sc = evsel->priv;\
234 perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
236 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
239 perf_evsel__delete(evsel);
242 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
244 evsel->priv = malloc(sizeof(struct syscall_tp));
245 if (evsel->priv != NULL) {
246 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
249 evsel->handler = handler;
260 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
262 struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
264 /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
266 evsel = perf_evsel__newtp("syscalls", direction);
271 if (perf_evsel__init_syscall_tp(evsel, handler))
277 perf_evsel__delete_priv(evsel);
281 #define perf_evsel__sc_tp_uint(evsel, name, sample) \
282 ({ struct syscall_tp *fields = evsel->priv; \
283 fields->name.integer(&fields->name, sample); })
285 #define perf_evsel__sc_tp_ptr(evsel, name, sample) \
286 ({ struct syscall_tp *fields = evsel->priv; \
287 fields->name.pointer(&fields->name, sample); })
289 size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
291 int idx = val - sa->offset;
293 if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL)
294 return scnprintf(bf, size, intfmt, val);
296 return scnprintf(bf, size, "%s", sa->entries[idx]);
299 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
301 struct syscall_arg *arg)
303 return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
/* Default strarray printer: out-of-range values fall back to decimal. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}
312 #define SCA_STRARRAY syscall_arg__scnprintf_strarray
316 struct strarray **entries;
319 #define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
320 .nr_entries = ARRAY_SIZE(array), \
324 size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
325 struct syscall_arg *arg)
327 struct strarrays *sas = arg->parm;
330 for (i = 0; i < sas->nr_entries; ++i) {
331 struct strarray *sa = sas->entries[i];
332 int idx = arg->val - sa->offset;
334 if (idx >= 0 && idx < sa->nr_entries) {
335 if (sa->entries[idx] == NULL)
337 return scnprintf(bf, size, "%s", sa->entries[idx]);
341 return scnprintf(bf, size, "%d", arg->val);
345 #define AT_FDCWD -100
348 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
349 struct syscall_arg *arg)
354 return scnprintf(bf, size, "CWD");
356 return syscall_arg__scnprintf_fd(bf, size, arg);
359 #define SCA_FDAT syscall_arg__scnprintf_fd_at
361 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
362 struct syscall_arg *arg);
364 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
366 size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
368 return scnprintf(bf, size, "%#lx", arg->val);
371 size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
373 return scnprintf(bf, size, "%d", arg->val);
376 size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
378 return scnprintf(bf, size, "%ld", arg->val);
381 static const char *bpf_cmd[] = {
382 "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
383 "MAP_GET_NEXT_KEY", "PROG_LOAD",
385 static DEFINE_STRARRAY(bpf_cmd);
387 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
388 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
390 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
391 static DEFINE_STRARRAY(itimers);
393 static const char *keyctl_options[] = {
394 "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
395 "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
396 "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
397 "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
398 "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
400 static DEFINE_STRARRAY(keyctl_options);
402 static const char *whences[] = { "SET", "CUR", "END",
410 static DEFINE_STRARRAY(whences);
412 static const char *fcntl_cmds[] = {
413 "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
414 "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
415 "SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
418 static DEFINE_STRARRAY(fcntl_cmds);
420 static const char *fcntl_linux_specific_cmds[] = {
421 "SETLEASE", "GETLEASE", "NOTIFY", [5] = "CANCELLK", "DUPFD_CLOEXEC",
422 "SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
423 "GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
426 static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);
428 static struct strarray *fcntl_cmds_arrays[] = {
429 &strarray__fcntl_cmds,
430 &strarray__fcntl_linux_specific_cmds,
433 static DEFINE_STRARRAYS(fcntl_cmds_arrays);
435 static const char *rlimit_resources[] = {
436 "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
437 "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
440 static DEFINE_STRARRAY(rlimit_resources);
442 static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
443 static DEFINE_STRARRAY(sighow);
445 static const char *clockid[] = {
446 "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
447 "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
448 "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
450 static DEFINE_STRARRAY(clockid);
452 static const char *socket_families[] = {
453 "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
454 "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
455 "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
456 "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
457 "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
458 "ALG", "NFC", "VSOCK",
460 static DEFINE_STRARRAY(socket_families);
462 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
463 struct syscall_arg *arg)
468 if (mode == F_OK) /* 0 */
469 return scnprintf(bf, size, "F");
471 if (mode & n##_OK) { \
472 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
482 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
487 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
489 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
490 struct syscall_arg *arg);
492 #define SCA_FILENAME syscall_arg__scnprintf_filename
494 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
495 struct syscall_arg *arg)
497 int printed = 0, flags = arg->val;
500 if (flags & O_##n) { \
501 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
510 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
515 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
517 #ifndef GRND_NONBLOCK
518 #define GRND_NONBLOCK 0x0001
521 #define GRND_RANDOM 0x0002
524 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
525 struct syscall_arg *arg)
527 int printed = 0, flags = arg->val;
530 if (flags & GRND_##n) { \
531 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
532 flags &= ~GRND_##n; \
540 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
545 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
547 #define STRARRAY(name, array) \
548 { .scnprintf = SCA_STRARRAY, \
549 .parm = &strarray__##array, }
551 #include "trace/beauty/arch_errno_names.c"
552 #include "trace/beauty/eventfd.c"
553 #include "trace/beauty/futex_op.c"
554 #include "trace/beauty/futex_val3.c"
555 #include "trace/beauty/mmap.c"
556 #include "trace/beauty/mode_t.c"
557 #include "trace/beauty/msg_flags.c"
558 #include "trace/beauty/open_flags.c"
559 #include "trace/beauty/perf_event_open.c"
560 #include "trace/beauty/pid.c"
561 #include "trace/beauty/sched_policy.c"
562 #include "trace/beauty/seccomp.c"
563 #include "trace/beauty/signum.c"
564 #include "trace/beauty/socket_type.c"
565 #include "trace/beauty/waitid_options.c"
567 struct syscall_arg_fmt {
568 size_t (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
574 static struct syscall_fmt {
577 struct syscall_arg_fmt arg[6];
584 .arg = { [1] = { .scnprintf = SCA_ACCMODE, /* mode */ }, }, },
586 .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
587 { .name = "brk", .hexret = true,
588 .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
589 { .name = "clock_gettime",
590 .arg = { [0] = STRARRAY(clk_id, clockid), }, },
591 { .name = "clone", .errpid = true, .nr_args = 5,
592 .arg = { [0] = { .name = "flags", .scnprintf = SCA_CLONE_FLAGS, },
593 [1] = { .name = "child_stack", .scnprintf = SCA_HEX, },
594 [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
595 [3] = { .name = "child_tidptr", .scnprintf = SCA_HEX, },
596 [4] = { .name = "tls", .scnprintf = SCA_HEX, }, }, },
598 .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
599 { .name = "epoll_ctl",
600 .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
601 { .name = "eventfd2",
602 .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
603 { .name = "fchmodat",
604 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
605 { .name = "fchownat",
606 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
608 .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
609 .parm = &strarrays__fcntl_cmds_arrays,
610 .show_zero = true, },
611 [2] = { .scnprintf = SCA_FCNTL_ARG, /* arg */ }, }, },
613 .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
614 { .name = "fstat", .alias = "newfstat", },
615 { .name = "fstatat", .alias = "newfstatat", },
617 .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
618 [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
619 { .name = "futimesat",
620 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
621 { .name = "getitimer",
622 .arg = { [0] = STRARRAY(which, itimers), }, },
623 { .name = "getpid", .errpid = true, },
624 { .name = "getpgid", .errpid = true, },
625 { .name = "getppid", .errpid = true, },
626 { .name = "getrandom",
627 .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
628 { .name = "getrlimit",
629 .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
630 { .name = "gettid", .errpid = true, },
633 #if defined(__i386__) || defined(__x86_64__)
635 * FIXME: Make this available to all arches.
637 [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
638 [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
640 [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
642 { .name = "kcmp", .nr_args = 5,
643 .arg = { [0] = { .name = "pid1", .scnprintf = SCA_PID, },
644 [1] = { .name = "pid2", .scnprintf = SCA_PID, },
645 [2] = { .name = "type", .scnprintf = SCA_KCMP_TYPE, },
646 [3] = { .name = "idx1", .scnprintf = SCA_KCMP_IDX, },
647 [4] = { .name = "idx2", .scnprintf = SCA_KCMP_IDX, }, }, },
649 .arg = { [0] = STRARRAY(option, keyctl_options), }, },
651 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
653 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
655 .arg = { [2] = STRARRAY(whence, whences), }, },
656 { .name = "lstat", .alias = "newlstat", },
658 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
659 [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
661 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
663 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
665 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
666 { .name = "mlockall",
667 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
668 { .name = "mmap", .hexret = true,
669 /* The standard mmap maps to old_mmap on s390x */
670 #if defined(__s390x__)
673 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ },
674 [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ },
675 [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ }, }, },
676 { .name = "mprotect",
677 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
678 [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ }, }, },
679 { .name = "mq_unlink",
680 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
681 { .name = "mremap", .hexret = true,
682 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ },
683 [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
684 [4] = { .scnprintf = SCA_HEX, /* new_addr */ }, }, },
686 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
688 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
689 { .name = "name_to_handle_at",
690 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
691 { .name = "newfstatat",
692 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
694 .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
695 { .name = "open_by_handle_at",
696 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ },
697 [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
699 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ },
700 [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
701 { .name = "perf_event_open",
702 .arg = { [2] = { .scnprintf = SCA_INT, /* cpu */ },
703 [3] = { .scnprintf = SCA_FD, /* group_fd */ },
704 [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
706 .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
707 { .name = "pkey_alloc",
708 .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS, /* access_rights */ }, }, },
709 { .name = "pkey_free",
710 .arg = { [0] = { .scnprintf = SCA_INT, /* key */ }, }, },
711 { .name = "pkey_mprotect",
712 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
713 [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ },
714 [3] = { .scnprintf = SCA_INT, /* pkey */ }, }, },
715 { .name = "poll", .timeout = true, },
716 { .name = "ppoll", .timeout = true, },
717 { .name = "prctl", .alias = "arch_prctl",
718 .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
719 [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
720 [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
721 { .name = "pread", .alias = "pread64", },
722 { .name = "preadv", .alias = "pread", },
723 { .name = "prlimit64",
724 .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
725 { .name = "pwrite", .alias = "pwrite64", },
726 { .name = "readlinkat",
727 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
728 { .name = "recvfrom",
729 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
730 { .name = "recvmmsg",
731 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
733 .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
734 { .name = "renameat",
735 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
736 { .name = "rt_sigaction",
737 .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
738 { .name = "rt_sigprocmask",
739 .arg = { [0] = STRARRAY(how, sighow), }, },
740 { .name = "rt_sigqueueinfo",
741 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
742 { .name = "rt_tgsigqueueinfo",
743 .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
744 { .name = "sched_setscheduler",
745 .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
747 .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP, /* op */ },
748 [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
749 { .name = "select", .timeout = true, },
750 { .name = "sendmmsg",
751 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
753 .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
755 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
756 { .name = "set_tid_address", .errpid = true, },
757 { .name = "setitimer",
758 .arg = { [0] = STRARRAY(which, itimers), }, },
759 { .name = "setrlimit",
760 .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
762 .arg = { [0] = STRARRAY(family, socket_families),
763 [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
764 [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
765 { .name = "socketpair",
766 .arg = { [0] = STRARRAY(family, socket_families),
767 [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
768 [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
769 { .name = "stat", .alias = "newstat", },
771 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fdat */ },
772 [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
773 [3] = { .scnprintf = SCA_STATX_MASK, /* mask */ }, }, },
775 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
777 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
778 { .name = "symlinkat",
779 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
781 .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
783 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
784 { .name = "uname", .alias = "newuname", },
785 { .name = "unlinkat",
786 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
787 { .name = "utimensat",
788 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
789 { .name = "wait4", .errpid = true,
790 .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
791 { .name = "waitid", .errpid = true,
792 .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
795 static int syscall_fmt__cmp(const void *name, const void *fmtp)
797 const struct syscall_fmt *fmt = fmtp;
798 return strcmp(name, fmt->name);
801 static struct syscall_fmt *syscall_fmt__find(const char *name)
803 const int nmemb = ARRAY_SIZE(syscall_fmts);
804 return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
808 * is_exit: is this "exit" or "exit_group"?
809 * is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter.
812 struct event_format *tp_format;
816 struct format_field *args;
818 struct syscall_fmt *fmt;
819 struct syscall_arg_fmt *arg_fmt;
823 * We need to have this 'calculated' boolean because in some cases we really
824 * don't know what is the duration of a syscall, for instance, when we start
825 * a session and some threads are waiting for a syscall to finish, say 'poll',
826 * in which case all we can do is to print "( ? ) for duration and for the
829 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
831 double duration = (double)t / NSEC_PER_MSEC;
832 size_t printed = fprintf(fp, "(");
835 printed += fprintf(fp, " ");
836 else if (duration >= 1.0)
837 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
838 else if (duration >= 0.01)
839 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
841 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
842 return printed + fprintf(fp, "): ");
846 * filename.ptr: The filename char pointer that will be vfs_getname'd
847 * filename.entry_str_pos: Where to insert the string translated from
848 * filename.ptr by the vfs_getname tracepoint/kprobe.
849 * ret_scnprintf: syscall args may set this to a different syscall return
850 * formatter, for instance, fcntl may return fds, file flags, etc.
852 struct thread_trace {
855 unsigned long nr_events;
856 unsigned long pfmaj, pfmin;
859 size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
862 short int entry_str_pos;
864 unsigned int namelen;
872 struct intlist *syscall_stats;
875 static struct thread_trace *thread_trace__new(void)
877 struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));
880 ttrace->paths.max = -1;
882 ttrace->syscall_stats = intlist__new(NULL);
887 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
889 struct thread_trace *ttrace;
894 if (thread__priv(thread) == NULL)
895 thread__set_priv(thread, thread_trace__new());
897 if (thread__priv(thread) == NULL)
900 ttrace = thread__priv(thread);
905 color_fprintf(fp, PERF_COLOR_RED,
906 "WARNING: not enough memory, dropping samples!\n");
911 void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
912 size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
914 struct thread_trace *ttrace = thread__priv(arg->thread);
916 ttrace->ret_scnprintf = ret_scnprintf;
919 #define TRACE_PFMAJ (1 << 0)
920 #define TRACE_PFMIN (1 << 1)
922 static const size_t trace__entry_str_size = 2048;
924 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
926 struct thread_trace *ttrace = thread__priv(thread);
928 if (fd > ttrace->paths.max) {
929 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
934 if (ttrace->paths.max != -1) {
935 memset(npath + ttrace->paths.max + 1, 0,
936 (fd - ttrace->paths.max) * sizeof(char *));
938 memset(npath, 0, (fd + 1) * sizeof(char *));
941 ttrace->paths.table = npath;
942 ttrace->paths.max = fd;
945 ttrace->paths.table[fd] = strdup(pathname);
947 return ttrace->paths.table[fd] != NULL ? 0 : -1;
950 static int thread__read_fd_path(struct thread *thread, int fd)
952 char linkname[PATH_MAX], pathname[PATH_MAX];
956 if (thread->pid_ == thread->tid) {
957 scnprintf(linkname, sizeof(linkname),
958 "/proc/%d/fd/%d", thread->pid_, fd);
960 scnprintf(linkname, sizeof(linkname),
961 "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
964 if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
967 ret = readlink(linkname, pathname, sizeof(pathname));
969 if (ret < 0 || ret > st.st_size)
972 pathname[ret] = '\0';
973 return trace__set_fd_pathname(thread, fd, pathname);
976 static const char *thread__fd_path(struct thread *thread, int fd,
979 struct thread_trace *ttrace = thread__priv(thread);
987 if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
990 ++trace->stats.proc_getname;
991 if (thread__read_fd_path(thread, fd))
995 return ttrace->paths.table[fd];
998 size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
1001 size_t printed = scnprintf(bf, size, "%d", fd);
1002 const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1005 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1010 size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1012 size_t printed = scnprintf(bf, size, "%d", fd);
1013 struct thread *thread = machine__find_thread(trace->host, pid, pid);
1016 const char *path = thread__fd_path(thread, fd, trace);
1019 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1021 thread__put(thread);
1027 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1028 struct syscall_arg *arg)
1031 size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1032 struct thread_trace *ttrace = thread__priv(arg->thread);
1034 if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1035 zfree(&ttrace->paths.table[fd]);
1040 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1043 struct thread_trace *ttrace = thread__priv(thread);
1045 ttrace->filename.ptr = ptr;
1046 ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1049 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1050 struct syscall_arg *arg)
1052 unsigned long ptr = arg->val;
1054 if (!arg->trace->vfs_getname)
1055 return scnprintf(bf, size, "%#x", ptr);
1057 thread__set_filename_pos(arg->thread, bf, ptr);
1061 static bool trace__filter_duration(struct trace *trace, double t)
1063 return t < (trace->duration_filter * NSEC_PER_MSEC);
1066 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1068 double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1070 return fprintf(fp, "%10.3f ", ts);
1074 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1075 * using ttrace->entry_time for a thread that receives a sys_exit without
1076 * first having received a sys_enter ("poll" issued before tracing session
1077 * starts, lost sys_enter exit due to ring buffer overflow).
1079 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1082 return __trace__fprintf_tstamp(trace, tstamp, fp);
1084 return fprintf(fp, " ? ");
1087 static bool done = false;
1088 static bool interrupted = false;
1090 static void sig_handler(int sig)
1093 interrupted = sig == SIGINT;
1096 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1097 u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1099 size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1100 printed += fprintf_duration(duration, duration_calculated, fp);
1102 if (trace->multiple_threads) {
1103 if (trace->show_comm)
1104 printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1105 printed += fprintf(fp, "%d ", thread->tid);
1111 static int trace__process_event(struct trace *trace, struct machine *machine,
1112 union perf_event *event, struct perf_sample *sample)
1116 switch (event->header.type) {
1117 case PERF_RECORD_LOST:
1118 color_fprintf(trace->output, PERF_COLOR_RED,
1119 "LOST %" PRIu64 " events!\n", event->lost.lost);
1120 ret = machine__process_lost_event(machine, event, sample);
1123 ret = machine__process_event(machine, event, sample);
1130 static int trace__tool_process(struct perf_tool *tool,
1131 union perf_event *event,
1132 struct perf_sample *sample,
1133 struct machine *machine)
1135 struct trace *trace = container_of(tool, struct trace, tool);
1136 return trace__process_event(trace, machine, event, sample);
1139 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1141 struct machine *machine = vmachine;
1143 if (machine->kptr_restrict_warned)
1146 if (symbol_conf.kptr_restrict) {
1147 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1148 "Check /proc/sys/kernel/kptr_restrict.\n\n"
1149 "Kernel samples will not be resolved.\n");
1150 machine->kptr_restrict_warned = true;
1154 return machine__resolve_kernel_addr(vmachine, addrp, modp);
1157 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1159 int err = symbol__init(NULL);
1164 trace->host = machine__new_host();
1165 if (trace->host == NULL)
1168 err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
1172 err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1173 evlist->threads, trace__tool_process, false,
1174 trace->opts.proc_map_timeout, 1);
1182 static void trace__symbols__exit(struct trace *trace)
1184 machine__exit(trace->host);
1190 static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1194 if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1195 nr_args = sc->fmt->nr_args;
1197 sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1198 if (sc->arg_fmt == NULL)
1201 for (idx = 0; idx < nr_args; ++idx) {
1203 sc->arg_fmt[idx] = sc->fmt->arg[idx];
1206 sc->nr_args = nr_args;
/*
 * Walk the syscall's tracepoint fields and pick a pretty-printer for each
 * argument by field name/type heuristics, unless the static fmt table
 * already supplies one (that case is skipped — 'continue' line elided).
 */
1210 static int syscall__set_arg_fmts(struct syscall *sc)
1212 struct format_field *field;
1215 for (field = sc->args; field; field = field->next, ++idx) {
1216 if (sc->fmt && sc->fmt->arg[idx].scnprintf)
/* 'const char *' path-like names get the filename beautifier. */
1219 if (strcmp(field->type, "const char *") == 0 &&
1220 (strcmp(field->name, "filename") == 0 ||
1221 strcmp(field->name, "path") == 0 ||
1222 strcmp(field->name, "pathname") == 0))
1223 sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
1224 else if (field->flags & FIELD_IS_POINTER)
1225 sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
1226 else if (strcmp(field->type, "pid_t") == 0)
1227 sc->arg_fmt[idx].scnprintf = SCA_PID;
1228 else if (strcmp(field->type, "umode_t") == 0)
1229 sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
/* Integer fields whose name ends in "fd" are treated as file descriptors. */
1230 else if ((strcmp(field->type, "int") == 0 ||
1231 strcmp(field->type, "unsigned int") == 0 ||
1232 strcmp(field->type, "long") == 0) &&
1233 (len = strlen(field->name)) >= 2 &&
1234 strcmp(field->name + len - 2, "fd") == 0) {
1236 * /sys/kernel/tracing/events/syscalls/sys_enter*
1237 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1242 sc->arg_fmt[idx].scnprintf = SCA_FD;
/*
 * Lazily populate trace->syscalls.table[id]: grow the table on demand,
 * resolve the syscall name, look up its static format entry and its
 * syscalls:sys_enter_* tracepoint format, then install per-arg printers.
 * Returns syscall__set_arg_fmts() result, or negative on allocation
 * failure (elided return lines).
 */
1249 static int trace__read_syscall_info(struct trace *trace, int id)
1253 const char *name = syscalltbl__name(trace->sctbl, id);
/* Grow the table to cover 'id', zeroing only the newly added entries. */
1258 if (id > trace->syscalls.max) {
1259 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1261 if (nsyscalls == NULL)
1264 if (trace->syscalls.max != -1) {
1265 memset(nsyscalls + trace->syscalls.max + 1, 0,
1266 (id - trace->syscalls.max) * sizeof(*sc));
1268 memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1271 trace->syscalls.table = nsyscalls;
1272 trace->syscalls.max = id;
1275 sc = trace->syscalls.table + id;
1278 sc->fmt = syscall_fmt__find(sc->name);
1280 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1281 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
/* Retry under the alias name (e.g. renamed syscalls) if the lookup failed. */
1283 if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1284 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1285 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
/* No tp format => fall back to the generic 6-arg layout. */
1288 if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
1291 if (IS_ERR(sc->tp_format))
1294 sc->args = sc->tp_format->format.fields;
1296 * We need to check and discard the first variable '__syscall_nr'
1297 * or 'nr' that mean the syscall number. It is needless here.
1298 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
1300 if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1301 sc->args = sc->args->next;
1305 sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1306 sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat");
1308 return syscall__set_arg_fmts(sc);
/*
 * Translate the -e/--expr qualifier strlist into syscall ids in
 * trace->ev_qualifier_ids. Entries may be exact names or globs; globs can
 * match several syscalls, in which case the id array is grown (doubling —
 * growth line elided) as extra matches are appended. On any invalid name
 * an error is printed and the ids array is freed.
 */
1311 static int trace__validate_ev_qualifier(struct trace *trace)
1314 size_t nr_allocated;
1315 struct str_node *pos;
1317 trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1318 trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1319 sizeof(trace->ev_qualifier_ids.entries[0]));
1321 if (trace->ev_qualifier_ids.entries == NULL) {
1322 fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1328 nr_allocated = trace->ev_qualifier_ids.nr;
1331 strlist__for_each_entry(pos, trace->ev_qualifier) {
1332 const char *sc = pos->s;
1333 int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;
/* Not an exact name: try glob matching against the syscall table. */
1336 id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
1341 fputs("Error:\tInvalid syscall ", trace->output);
1344 fputs(", ", trace->output);
1347 fputs(sc, trace->output);
1350 trace->ev_qualifier_ids.entries[i++] = id;
1351 if (match_next == -1)
/* Collect every additional glob match, growing the array as needed. */
1355 id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
1358 if (nr_allocated == trace->ev_qualifier_ids.nr) {
1362 entries = realloc(trace->ev_qualifier_ids.entries,
1363 nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
1364 if (entries == NULL) {
1366 fputs("\nError:\t Not enough memory for parsing\n", trace->output);
1369 trace->ev_qualifier_ids.entries = entries;
1371 trace->ev_qualifier_ids.nr++;
1372 trace->ev_qualifier_ids.entries[i++] = id;
1377 fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1378 "\nHint:\tand: 'man syscalls'\n", trace->output);
1380 zfree(&trace->ev_qualifier_ids.entries);
1381 trace->ev_qualifier_ids.nr = 0;
1388 * args is to be interpreted as a series of longs but we need to handle
1389 * 8-byte unaligned accesses. args points to raw_data within the event
1390 * and raw_data is guaranteed to be 8-byte unaligned because it is
1391 * preceded by raw_size which is a u32. So we need to copy args to a temp
1392 * variable to read it. Most notably this avoids extended load instructions
1393 * on unaligned addresses
/* Fetch raw syscall argument 'idx' from the sample payload; see above. */
1395 unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1398 unsigned char *p = arg->args + sizeof(unsigned long) * idx;
/* memcpy instead of a cast+deref: safe on unaligned addresses. */
1400 memcpy(&val, p, sizeof(val));
/*
 * Print the argument's name prefix ("name: "), falling back to a
 * positional "argN: " when no per-arg name is known.
 */
1404 static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1405 struct syscall_arg *arg)
1407 if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1408 return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1410 return scnprintf(bf, size, "arg%d: ", arg->idx);
/*
 * Format one argument value: use the per-arg beautifier (with its
 * optional parm cookie) when present, otherwise print the raw long.
 */
1413 static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1414 struct syscall_arg *arg, unsigned long val)
1416 if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1418 if (sc->arg_fmt[arg->idx].parm)
1419 arg->parm = sc->arg_fmt[arg->idx].parm;
1420 return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1422 return scnprintf(bf, size, "%ld", val);
/*
 * Render a syscall's full argument list into bf. Two paths: when the
 * tracepoint format was parsed, iterate the named fields, suppressing
 * zero-valued args that have no string mapping; otherwise (format file
 * unreadable) dump the raw 6 args as "argN: val". Returns chars printed.
 */
1425 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1426 unsigned char *args, struct trace *trace,
1427 struct thread *thread)
1432 struct syscall_arg arg = {
1439 struct thread_trace *ttrace = thread__priv(thread);
1442 * Things like fcntl will set this in its 'cmd' formatter to pick the
1443 * right formatter for the return value (an fd? file flags?), which is
1444 * not needed for syscalls that always return a given type, say an fd.
1446 ttrace->ret_scnprintf = NULL;
1448 if (sc->args != NULL) {
1449 struct format_field *field;
1451 for (field = sc->args; field;
1452 field = field->next, ++arg.idx, bit <<= 1) {
1456 val = syscall_arg__val(&arg, arg.idx);
1459 * Suppress this argument if its value is zero and
1460 * and we don't have a string associated in an
/* ...unless show_zero or a strarray mapping demands printing it. */
1465 (sc->arg_fmt[arg.idx].show_zero ||
1466 sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
1467 sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
1468 sc->arg_fmt[arg.idx].parm))
1471 printed += scnprintf(bf + printed, size - printed,
1472 "%s%s: ", printed ? ", " : "", field->name);
1473 printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
1475 } else if (IS_ERR(sc->tp_format)) {
1477 * If we managed to read the tracepoint /format file, then we
1478 * may end up not having any args, like with gettid(), so only
1479 * print the raw args when we didn't manage to read it.
1481 while (arg.idx < sc->nr_args) {
1484 val = syscall_arg__val(&arg, arg.idx);
1486 printed += scnprintf(bf + printed, size - printed, ", ");
1487 printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
1488 printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
/* Common signature for per-event sample handlers stashed in evsel->handler. */
1498 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1499 union perf_event *event,
1500 struct perf_sample *sample);
/*
 * Return the (lazily initialized) syscall descriptor for 'id', or NULL
 * for invalid/unreadable ids, emitting a diagnostic on trace->output.
 */
1502 static struct syscall *trace__syscall_info(struct trace *trace,
1503 struct perf_evsel *evsel, int id)
1509 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1510 * before that, leaving at a higher verbosity level till that is
1511 * explained. Reproduced with plain ftrace with:
1513 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1514 * grep "NR -1 " /t/trace_pipe
1516 * After generating some load on the machine.
1520 fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1521 id, perf_evsel__name(evsel), ++n);
/* First sighting of this id: read its name/format/arg printers. */
1526 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1527 trace__read_syscall_info(trace, id))
1530 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1533 return &trace->syscalls.table[id];
1537 fprintf(trace->output, "Problems reading syscall %d", id);
1538 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1539 fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1540 fputs(" information\n", trace->output);
/*
 * Accumulate per-thread, per-syscall-id duration statistics (used by the
 * --summary output). Allocates the stats node lazily on first use.
 */
1545 static void thread__update_stats(struct thread_trace *ttrace,
1546 int id, struct perf_sample *sample)
1548 struct int_node *inode;
1549 struct stats *stats;
1552 inode = intlist__findnew(ttrace->syscall_stats, id);
1556 stats = inode->priv;
1557 if (stats == NULL) {
1558 stats = malloc(sizeof(struct stats));
1562 inode->priv = stats;
/* Only meaningful if we saw the matching sys_enter for this thread. */
1565 if (ttrace->entry_time && sample->time > ttrace->entry_time)
1566 duration = sample->time - ttrace->entry_time;
1568 update_stats(stats, duration);
/*
 * A new event arrived while the current thread still has an unfinished
 * sys_enter pending: flush that entry with a "...)" marker so output
 * stays readable, and clear the pending flag.
 */
1571 static int trace__printf_interrupted_entry(struct trace *trace)
1573 struct thread_trace *ttrace;
1576 if (trace->failure_only || trace->current == NULL)
1579 ttrace = thread__priv(trace->current);
1581 if (!ttrace->entry_pending)
1584 printed = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
1585 printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1586 ttrace->entry_pending = false;
/*
 * Debug helper gated on --print-sample: dump the raw sample identity
 * (event name, timestamp in ms, comm, pid/tid, cpu).
 */
1591 static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
1592 struct perf_sample *sample, struct thread *thread)
1596 if (trace->print_sample) {
1597 double ts = (double)sample->time / NSEC_PER_MSEC;
1599 printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
1600 perf_evsel__name(evsel), ts,
1601 thread__comm_str(thread),
1602 sample->pid, sample->tid, sample->cpu);
/*
 * raw_syscalls:sys_enter handler. Formats "name(args" into the thread's
 * entry_str buffer; for is_exit-style syscalls (which never return) the
 * line is printed immediately, otherwise it is held pending until the
 * matching sys_exit (or an interruption) completes it.
 */
1608 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1609 union perf_event *event __maybe_unused,
1610 struct perf_sample *sample)
1615 struct thread *thread;
1616 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1617 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1618 struct thread_trace *ttrace;
1623 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1624 ttrace = thread__trace(thread, trace->output);
1628 trace__fprintf_sample(trace, evsel, sample, thread);
1630 args = perf_evsel__sc_tp_ptr(evsel, args, sample);
/* Lazily allocate the per-thread entry string buffer. */
1632 if (ttrace->entry_str == NULL) {
1633 ttrace->entry_str = malloc(trace__entry_str_size);
1634 if (!ttrace->entry_str)
1638 if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1639 trace__printf_interrupted_entry(trace);
1641 ttrace->entry_time = sample->time;
1642 msg = ttrace->entry_str;
1643 printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1645 printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1646 args, trace, thread);
/* exit/exit_group never return: print now rather than waiting for sys_exit. */
1649 if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
1650 trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
1651 fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
1654 ttrace->entry_pending = true;
1655 /* See trace__vfs_getname & trace__sys_exit */
1656 ttrace->filename.pending_open = false;
/* Track the most recent thread so interrupted entries can be flushed. */
1659 if (trace->current != thread) {
1660 thread__put(trace->current);
1661 trace->current = thread__get(thread);
1665 thread__put(thread);
/*
 * Resolve the sample's callchain into 'cursor', honoring a per-event
 * sample_max_stack override when the event set one (fallback value on
 * the elided line is presumably trace->max_stack — confirm).
 */
1669 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1670 struct perf_sample *sample,
1671 struct callchain_cursor *cursor)
1673 struct addr_location al;
1674 int max_stack = evsel->attr.sample_max_stack ?
1675 evsel->attr.sample_max_stack :
1678 if (machine__resolve(trace->host, &al, sample) < 0 ||
1679 thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack))
/* Print the previously-resolved callchain, 38-column aligned under the event line. */
1685 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1687 /* TODO: user-configurable print_opts */
1688 const unsigned int print_opts = EVSEL__PRINT_SYM |
1690 EVSEL__PRINT_UNKNOWN_AS_ADDR;
1692 return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
/*
 * Map a (positive) errno value to its symbolic name, using the arch the
 * events were recorded on so cross-arch reports stay correct.
 */
1695 static const char *errno_to_name(struct perf_evsel *evsel, int err)
1697 struct perf_env *env = perf_evsel__env(evsel);
1698 const char *arch_name = perf_env__arch(env);
1700 return arch_syscalls__strerrno(arch_name, err);
/*
 * raw_syscalls:sys_exit handler. Completes the pending entry line (or
 * prints a "continued" marker if the entry was flushed earlier), computes
 * the syscall duration, updates summary stats, resolves fd pathnames for
 * open/openat, formats the return value per the syscall's fmt (errno
 * name, timeout, hex, child pid comm, or custom ret_scnprintf), and
 * optionally prints the callchain.
 */
1703 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1704 union perf_event *event __maybe_unused,
1705 struct perf_sample *sample)
1709 bool duration_calculated = false;
1710 struct thread *thread;
1711 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1712 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1713 struct thread_trace *ttrace;
1718 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1719 ttrace = thread__trace(thread, trace->output);
1723 trace__fprintf_sample(trace, evsel, sample, thread);
1726 thread__update_stats(ttrace, id, sample);
1728 ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
/* open/openat succeeded: remember ret (the new fd) -> pathname mapping. */
1730 if (sc->is_open && ret >= 0 && ttrace->filename.pending_open) {
1731 trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1732 ttrace->filename.pending_open = false;
1733 ++trace->stats.vfs_getname;
1736 if (ttrace->entry_time) {
1737 duration = sample->time - ttrace->entry_time;
1738 if (trace__filter_duration(trace, duration))
1740 duration_calculated = true;
1741 } else if (trace->duration_filter)
1744 if (sample->callchain) {
1745 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1746 if (callchain_ret == 0) {
1747 if (callchain_cursor.nr < trace->min_stack)
1753 if (trace->summary_only || (ret >= 0 && trace->failure_only))
1756 trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
1758 if (ttrace->entry_pending) {
1759 fprintf(trace->output, "%-70s", ttrace->entry_str);
/* Entry was already flushed (interrupted): mark this line as a continuation. */
1761 fprintf(trace->output, " ... [");
1762 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1763 fprintf(trace->output, "]: %s()", sc->name);
1766 if (sc->fmt == NULL) {
1770 fprintf(trace->output, ") = %ld", ret);
1771 } else if (ret < 0) {
1773 char bf[STRERR_BUFSIZE];
1774 const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
1775 *e = errno_to_name(evsel, -ret);
1777 fprintf(trace->output, ") = -1 %s %s", e, emsg);
1779 } else if (ret == 0 && sc->fmt->timeout)
1780 fprintf(trace->output, ") = 0 Timeout");
1781 else if (ttrace->ret_scnprintf) {
1783 struct syscall_arg arg = {
/* One-shot: the custom return formatter is consumed here. */
1788 ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
1789 ttrace->ret_scnprintf = NULL;
1790 fprintf(trace->output, ") = %s", bf);
1791 } else if (sc->fmt->hexret)
1792 fprintf(trace->output, ") = %#lx", ret);
1793 else if (sc->fmt->errpid) {
/* Return value is a pid (e.g. fork/wait): annotate with the child's comm. */
1794 struct thread *child = machine__find_thread(trace->host, ret, ret);
1796 if (child != NULL) {
1797 fprintf(trace->output, ") = %ld", ret);
1798 if (child->comm_set)
1799 fprintf(trace->output, " (%s)", thread__comm_str(child));
1805 fputc('\n', trace->output);
1807 if (callchain_ret > 0)
1808 trace__fprintf_callchain(trace, sample);
1809 else if (callchain_ret < 0)
1810 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1812 ttrace->entry_pending = false;
1815 thread__put(thread);
/*
 * probe:vfs_getname handler: capture the pathname the kernel resolved
 * during an in-flight open, stash a copy for fd->path mapping at
 * sys_exit, and splice it into the pending entry_str at the position
 * recorded by the SCA_FILENAME beautifier (truncating from the left when
 * it doesn't fit).
 */
1819 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1820 union perf_event *event __maybe_unused,
1821 struct perf_sample *sample)
1823 struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1824 struct thread_trace *ttrace;
1825 size_t filename_len, entry_str_len, to_move;
1826 ssize_t remaining_space;
1828 const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1833 ttrace = thread__priv(thread);
1837 filename_len = strlen(filename);
1838 if (filename_len == 0)
/* Grow the per-thread filename buffer when the new path is longer. */
1841 if (ttrace->filename.namelen < filename_len) {
1842 char *f = realloc(ttrace->filename.name, filename_len + 1);
1847 ttrace->filename.namelen = filename_len;
1848 ttrace->filename.name = f;
1851 strcpy(ttrace->filename.name, filename);
1852 ttrace->filename.pending_open = true;
/* No placeholder recorded in entry_str: nothing to splice, we're done. */
1854 if (!ttrace->filename.ptr)
1857 entry_str_len = strlen(ttrace->entry_str);
1858 remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1859 if (remaining_space <= 0)
/* Keep the tail of the path (most informative part) when truncating. */
1862 if (filename_len > (size_t)remaining_space) {
1863 filename += filename_len - remaining_space;
1864 filename_len = remaining_space;
1867 to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1868 pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1869 memmove(pos + filename_len, pos, to_move);
1870 memcpy(pos, filename, filename_len);
1872 ttrace->filename.ptr = 0;
1873 ttrace->filename.entry_str_pos = 0;
1875 thread__put(thread);
/*
 * sched:sched_stat_runtime handler: accumulate per-thread and global
 * runtime (ms) for the summary. The trailing fprintf is the elided-label
 * error path for a missing thread_trace.
 */
1880 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1881 union perf_event *event __maybe_unused,
1882 struct perf_sample *sample)
1884 u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1885 double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1886 struct thread *thread = machine__findnew_thread(trace->host,
1889 struct thread_trace *ttrace = thread__trace(thread, trace->output);
1894 ttrace->runtime_ms += runtime_ms;
1895 trace->runtime_ms += runtime_ms;
1897 thread__put(thread);
1901 fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1903 perf_evsel__strval(evsel, sample, "comm"),
1904 (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1906 perf_evsel__intval(evsel, sample, "vruntime"));
/*
 * binary__fprintf() callback for BPF output events: print printable
 * bytes as-is and everything else as '.'; all other printer ops are
 * no-ops (grouped fallthrough cases).
 */
1910 static int bpf_output__printer(enum binary_printer_ops op,
1911 unsigned int val, void *extra __maybe_unused, FILE *fp)
1913 unsigned char ch = (unsigned char)val;
1916 case BINARY_PRINT_CHAR_DATA:
1917 return fprintf(fp, "%c", isprint(ch) ? ch : '.');
1918 case BINARY_PRINT_DATA_BEGIN:
1919 case BINARY_PRINT_LINE_BEGIN:
1920 case BINARY_PRINT_ADDR:
1921 case BINARY_PRINT_NUM_DATA:
1922 case BINARY_PRINT_NUM_PAD:
1923 case BINARY_PRINT_SEP:
1924 case BINARY_PRINT_CHAR_PAD:
1925 case BINARY_PRINT_LINE_END:
1926 case BINARY_PRINT_DATA_END:
/* Hexdump-style print of a BPF output event's raw payload, 8 bytes/line. */
1934 static void bpf_output__fprintf(struct trace *trace,
1935 struct perf_sample *sample)
1937 binary__fprintf(sample->raw_data, sample->raw_size, 8,
1938 bpf_output__printer, NULL, trace->output);
/*
 * Generic handler for --event tracepoints/BPF-output events: print a
 * timestamped line (with an empty duration column when syscalls are also
 * traced, to keep columns aligned), the event payload, and optionally
 * the callchain.
 */
1941 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1942 union perf_event *event __maybe_unused,
1943 struct perf_sample *sample)
1945 int callchain_ret = 0;
1947 if (sample->callchain) {
1948 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1949 if (callchain_ret == 0) {
1950 if (callchain_cursor.nr < trace->min_stack)
1956 trace__printf_interrupted_entry(trace);
1957 trace__fprintf_tstamp(trace, sample->time, trace->output);
1959 if (trace->trace_syscalls)
1960 fprintf(trace->output, "( ): ");
1962 fprintf(trace->output, "%s:", evsel->name);
1964 if (perf_evsel__is_bpf_output(evsel)) {
1965 bpf_output__fprintf(trace, sample);
1966 } else if (evsel->tp_format) {
1967 event_format__fprintf(evsel->tp_format, sample->cpu,
1968 sample->raw_data, sample->raw_size,
1972 fprintf(trace->output, "\n");
1974 if (callchain_ret > 0)
1975 trace__fprintf_callchain(trace, sample);
1976 else if (callchain_ret < 0)
1977 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
/*
 * Print a resolved fault location as "dso@sym+off", falling back to the
 * mapped address or, as a last resort, the raw sample address. Verbose
 * mode forces both dso and symbol.
 */
1982 static void print_location(FILE *f, struct perf_sample *sample,
1983 struct addr_location *al,
1984 bool print_dso, bool print_sym)
1987 if ((verbose > 0 || print_dso) && al->map)
1988 fprintf(f, "%s@", al->map->dso->long_name);
1990 if ((verbose > 0 || print_sym) && al->sym)
1991 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1992 al->addr - al->sym->start);
1994 fprintf(f, "0x%" PRIx64, al->addr);
1996 fprintf(f, "0x%" PRIx64, sample->addr);
/*
 * Page-fault software event handler: prints "majfault"/"minfault" with
 * the faulting IP's symbol and the target address resolved first as data
 * ('d') then, on the elided fallback, as code — plus the callchain when
 * requested.
 */
1999 static int trace__pgfault(struct trace *trace,
2000 struct perf_evsel *evsel,
2001 union perf_event *event __maybe_unused,
2002 struct perf_sample *sample)
2004 struct thread *thread;
2005 struct addr_location al;
2006 char map_type = 'd';
2007 struct thread_trace *ttrace;
2009 int callchain_ret = 0;
2011 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2013 if (sample->callchain) {
2014 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2015 if (callchain_ret == 0) {
2016 if (callchain_cursor.nr < trace->min_stack)
2022 ttrace = thread__trace(thread, trace->output);
/* Count major vs minor faults separately for the summary. */
2026 if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2031 if (trace->summary_only)
2034 thread__find_symbol(thread, sample->cpumode, sample->ip, &al);
2036 trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
2038 fprintf(trace->output, "%sfault [",
2039 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2042 print_location(trace->output, sample, &al, false, true);
2044 fprintf(trace->output, "] => ");
2046 thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
/* NOTE(review): second lookup is the elided code-map ('x') fallback — confirm. */
2049 thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
2057 print_location(trace->output, sample, &al, true, false);
2059 fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2061 if (callchain_ret > 0)
2062 trace__fprintf_callchain(trace, sample);
2063 else if (callchain_ret < 0)
2064 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2068 thread__put(thread);
/*
 * Record the first usable timestamp as the session's time origin, so
 * later timestamps can be printed relative to it (unless --full-time).
 */
2072 static void trace__set_base_time(struct trace *trace,
2073 struct perf_evsel *evsel,
2074 struct perf_sample *sample)
2077 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2078 * and don't use sample->time unconditionally, we may end up having
2079 * some other event in the future without PERF_SAMPLE_TIME for good
2080 * reason, i.e. we may not be interested in its timestamps, just in
2081 * it taking place, picking some piece of information when it
2082 * appears in our event stream (vfs_getname comes to mind).
2084 if (trace->base_time == 0 && !trace->full_time &&
2085 (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2086 trace->base_time = sample->time;
/*
 * perf_tool sample callback for replay mode (perf trace -i file):
 * dispatch to the evsel's registered tracepoint handler, skipping
 * filtered threads.
 */
2089 static int trace__process_sample(struct perf_tool *tool,
2090 union perf_event *event,
2091 struct perf_sample *sample,
2092 struct perf_evsel *evsel,
2093 struct machine *machine __maybe_unused)
2095 struct trace *trace = container_of(tool, struct trace, tool);
2096 struct thread *thread;
2099 tracepoint_handler handler = evsel->handler;
2101 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2102 if (thread && thread__is_filtered(thread))
2105 trace__set_base_time(trace, evsel, sample);
2109 handler(trace, evsel, event, sample);
2112 thread__put(thread);
/*
 * Implement 'perf trace record': build an argv for cmd_record() from the
 * base record args plus the syscall tracepoints (raw_syscalls on modern
 * kernels, syscalls:* on older ones like RHEL6) and any requested
 * page-fault events, then append the user's own arguments.
 */
2116 static int trace__record(struct trace *trace, int argc, const char **argv)
2118 unsigned int rec_argc, i, j;
2119 const char **rec_argv;
2120 const char * const record_args[] = {
2127 const char * const sc_args[] = { "-e", };
2128 unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2129 const char * const majpf_args[] = { "-e", "major-faults" };
2130 unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2131 const char * const minpf_args[] = { "-e", "minor-faults" };
2132 unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2134 /* +1 is for the event string below */
2135 rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2136 majpf_args_nr + minpf_args_nr + argc;
2137 rec_argv = calloc(rec_argc + 1, sizeof(char *));
2139 if (rec_argv == NULL)
2143 for (i = 0; i < ARRAY_SIZE(record_args); i++)
2144 rec_argv[j++] = record_args[i];
2146 if (trace->trace_syscalls) {
2147 for (i = 0; i < sc_args_nr; i++)
2148 rec_argv[j++] = sc_args[i];
2150 /* event string may be different for older kernels - e.g., RHEL6 */
2151 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2152 rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2153 else if (is_valid_tracepoint("syscalls:sys_enter"))
2154 rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2156 pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2162 if (trace->trace_pgfaults & TRACE_PFMAJ)
2163 for (i = 0; i < majpf_args_nr; i++)
2164 rec_argv[j++] = majpf_args[i];
2166 if (trace->trace_pgfaults & TRACE_PFMIN)
2167 for (i = 0; i < minpf_args_nr; i++)
2168 rec_argv[j++] = minpf_args[i];
2170 for (i = 0; i < (unsigned int)argc; i++)
2171 rec_argv[j++] = argv[i];
2173 return cmd_record(j, rec_argv);
2176 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
/*
 * Try to add the probe:vfs_getname event (set up separately via
 * 'perf probe'); returns false when the probe or its "pathname" field
 * is unavailable, in which case pathname beautifying is skipped.
 */
2178 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2180 struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2185 if (perf_evsel__field(evsel, "pathname") == NULL) {
2186 perf_evsel__delete(evsel);
2190 evsel->handler = trace__vfs_getname;
2191 perf_evlist__add(evlist, evsel);
/*
 * Create a software page-fault evsel (config selects MAJ or MIN) that
 * samples every fault (period 1) and routes to trace__pgfault.
 */
2195 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2197 struct perf_evsel *evsel;
2198 struct perf_event_attr attr = {
2199 .type = PERF_TYPE_SOFTWARE,
2203 attr.config = config;
2204 attr.sample_period = 1;
2206 event_attr_init(&attr);
2208 evsel = perf_evsel__new(&attr);
2210 evsel->handler = trace__pgfault;
/*
 * Live-mode event dispatcher: non-sample records go through the generic
 * side-band processing; samples are routed to the owning evsel's handler
 * after sanity-checking that tracepoint samples carry a payload.
 */
2215 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2217 const u32 type = event->header.type;
2218 struct perf_evsel *evsel;
2220 if (type != PERF_RECORD_SAMPLE) {
2221 trace__process_event(trace, trace->host, event, sample);
2225 evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2226 if (evsel == NULL) {
2227 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2231 trace__set_base_time(trace, evsel, sample);
2233 if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2234 sample->raw_data == NULL) {
2235 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2236 perf_evsel__name(evsel), sample->tid,
2237 sample->cpu, sample->raw_size);
2239 tracepoint_handler handler = evsel->handler;
2240 handler(trace, evsel, event, sample);
/*
 * Create and register the raw_syscalls:sys_enter/sys_exit tracepoint
 * events, wire up their field accessors and callchain config, and stash
 * them in trace->syscalls.events. Uses goto-cleanup on partial failure.
 */
2244 static int trace__add_syscall_newtp(struct trace *trace)
2247 struct perf_evlist *evlist = trace->evlist;
2248 struct perf_evsel *sys_enter, *sys_exit;
2250 sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2251 if (sys_enter == NULL)
2254 if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2255 goto out_delete_sys_enter;
2257 sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2258 if (sys_exit == NULL)
2259 goto out_delete_sys_enter;
2261 if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2262 goto out_delete_sys_exit;
2264 perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
2265 perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);
2267 perf_evlist__add(evlist, sys_enter);
2268 perf_evlist__add(evlist, sys_exit);
2270 if (callchain_param.enabled && !trace->kernel_syscallchains) {
2272 * We're interested only in the user space callchain
2273 * leading to the syscall, allow overriding that for
2274 * debugging reasons using --kernel_syscall_callchains
2276 sys_exit->attr.exclude_callchain_kernel = 1;
2279 trace->syscalls.events.sys_enter = sys_enter;
2280 trace->syscalls.events.sys_exit = sys_exit;
2286 out_delete_sys_exit:
2287 perf_evsel__delete_priv(sys_exit);
2288 out_delete_sys_enter:
2289 perf_evsel__delete_priv(sys_enter);
/*
 * Build an "id in (...)" (or negated) tracepoint filter expression from
 * the validated qualifier ids and append it to both sys_enter and
 * sys_exit. NOTE(review): filter free/error lines are elided — confirm.
 */
2293 static int trace__set_ev_qualifier_filter(struct trace *trace)
2296 struct perf_evsel *sys_exit;
2297 char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2298 trace->ev_qualifier_ids.nr,
2299 trace->ev_qualifier_ids.entries);
2304 if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2306 sys_exit = trace->syscalls.events.sys_exit;
2307 err = perf_evsel__append_tp_filter(sys_exit, filter);
/*
 * Filter out perf's own pid plus ancestor sshd processes: tracing the
 * shell's transport would generate events for every line we print,
 * creating a feedback loop. Walks up the parent chain collecting pids.
 */
2318 static int trace__set_filter_loop_pids(struct trace *trace)
2320 unsigned int nr = 1;
2324 struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
2326 while (thread && nr < ARRAY_SIZE(pids)) {
2327 struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
2332 if (!strcmp(thread__comm_str(parent), "sshd")) {
2333 pids[nr++] = parent->tid;
2339 return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
2342 static int trace__run(struct trace *trace, int argc, const char **argv)
2344 struct perf_evlist *evlist = trace->evlist;
2345 struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2347 unsigned long before;
2348 const bool forks = argc > 0;
2349 bool draining = false;
2353 if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2354 goto out_error_raw_syscalls;
2356 if (trace->trace_syscalls)
2357 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2359 if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2360 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2361 if (pgfault_maj == NULL)
2363 perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2364 perf_evlist__add(evlist, pgfault_maj);
2367 if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2368 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2369 if (pgfault_min == NULL)
2371 perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2372 perf_evlist__add(evlist, pgfault_min);
2376 perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2377 trace__sched_stat_runtime))
2378 goto out_error_sched_stat_runtime;
2381 * If a global cgroup was set, apply it to all the events without an
2382 * explicit cgroup. I.e.:
2384 * trace -G A -e sched:*switch
2386 * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
2387 * _and_ sched:sched_switch to the 'A' cgroup, while:
2389 * trace -e sched:*switch -G A
2391 * will only set the sched:sched_switch event to the 'A' cgroup, all the
2392 * other events (raw_syscalls:sys_{enter,exit}, etc are left "without"
2393 * a cgroup (on the root cgroup, sys wide, etc).
2397 * trace -G A -e sched:*switch -G B
2399 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
2400 * to the 'B' cgroup.
2402 * evlist__set_default_cgroup() grabs a reference of the passed cgroup
2403 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
2406 evlist__set_default_cgroup(trace->evlist, trace->cgroup);
2408 err = perf_evlist__create_maps(evlist, &trace->opts.target);
2410 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2411 goto out_delete_evlist;
2414 err = trace__symbols_init(trace, evlist);
2416 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2417 goto out_delete_evlist;
2420 perf_evlist__config(evlist, &trace->opts, &callchain_param);
2422 signal(SIGCHLD, sig_handler);
2423 signal(SIGINT, sig_handler);
2426 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2429 fprintf(trace->output, "Couldn't run the workload!\n");
2430 goto out_delete_evlist;
2434 err = perf_evlist__open(evlist);
2436 goto out_error_open;
2438 err = bpf__apply_obj_config();
2440 char errbuf[BUFSIZ];
2442 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2443 pr_err("ERROR: Apply config to BPF failed: %s\n",
2445 goto out_error_open;
2449 * Better not use !target__has_task() here because we need to cover the
2450 * case where no threads were specified in the command line, but a
2451 * workload was, and in that case we will fill in the thread_map when
2452 * we fork the workload in perf_evlist__prepare_workload.
2454 if (trace->filter_pids.nr > 0)
2455 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2456 else if (thread_map__pid(evlist->threads, 0) == -1)
2457 err = trace__set_filter_loop_pids(trace);
2462 if (trace->ev_qualifier_ids.nr > 0) {
2463 err = trace__set_ev_qualifier_filter(trace);
2467 pr_debug("event qualifier tracepoint filter: %s\n",
2468 trace->syscalls.events.sys_exit->filter);
2471 err = perf_evlist__apply_filters(evlist, &evsel);
2473 goto out_error_apply_filters;
2475 err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
2477 goto out_error_mmap;
2479 if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
2480 perf_evlist__enable(evlist);
2483 perf_evlist__start_workload(evlist);
2485 if (trace->opts.initial_delay) {
2486 usleep(trace->opts.initial_delay * 1000);
2487 perf_evlist__enable(evlist);
2490 trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2491 evlist->threads->nr > 1 ||
2492 perf_evlist__first(evlist)->attr.inherit;
2495 * Now that we already used evsel->attr to ask the kernel to setup the
2496 * events, lets reuse evsel->attr.sample_max_stack as the limit in
2497 * trace__resolve_callchain(), allowing per-event max-stack settings
2498 * to override an explicitly set --max-stack global setting.
2500 evlist__for_each_entry(evlist, evsel) {
2501 if (evsel__has_callchain(evsel) &&
2502 evsel->attr.sample_max_stack == 0)
2503 evsel->attr.sample_max_stack = trace->max_stack;
2506 before = trace->nr_events;
2508 for (i = 0; i < evlist->nr_mmaps; i++) {
2509 union perf_event *event;
2510 struct perf_mmap *md;
2512 md = &evlist->mmap[i];
2513 if (perf_mmap__read_init(md) < 0)
2516 while ((event = perf_mmap__read_event(md)) != NULL) {
2517 struct perf_sample sample;
2521 err = perf_evlist__parse_sample(evlist, event, &sample);
2523 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2527 trace__handle_event(trace, event, &sample);
2529 perf_mmap__consume(md);
2534 if (done && !draining) {
2535 perf_evlist__disable(evlist);
2539 perf_mmap__read_done(md);
2542 if (trace->nr_events == before) {
2543 int timeout = done ? 100 : -1;
2545 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2546 if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2556 thread__zput(trace->current);
2558 perf_evlist__disable(evlist);
2562 trace__fprintf_thread_summary(trace, trace->output);
2564 if (trace->show_tool_stats) {
2565 fprintf(trace->output, "Stats:\n "
2566 " vfs_getname : %" PRIu64 "\n"
2567 " proc_getname: %" PRIu64 "\n",
2568 trace->stats.vfs_getname,
2569 trace->stats.proc_getname);
2574 trace__symbols__exit(trace);
2576 perf_evlist__delete(evlist);
2577 cgroup__put(trace->cgroup);
2578 trace->evlist = NULL;
2579 trace->live = false;
2582 char errbuf[BUFSIZ];
2584 out_error_sched_stat_runtime:
2585 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2588 out_error_raw_syscalls:
2589 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2593 perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2597 perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2600 fprintf(trace->output, "%s\n", errbuf);
2601 goto out_delete_evlist;
2603 out_error_apply_filters:
2604 fprintf(trace->output,
2605 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2606 evsel->filter, perf_evsel__name(evsel), errno,
2607 str_error_r(errno, errbuf, sizeof(errbuf)));
2608 goto out_delete_evlist;
2611 fprintf(trace->output, "Not enough memory to run!\n");
2612 goto out_delete_evlist;
2615 fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2616 goto out_delete_evlist;
2619 static int trace__replay(struct trace *trace)
2621 const struct perf_evsel_str_handler handlers[] = {
2622 { "probe:vfs_getname", trace__vfs_getname, },
2624 struct perf_data data = {
2628 .mode = PERF_DATA_MODE_READ,
2629 .force = trace->force,
2631 struct perf_session *session;
2632 struct perf_evsel *evsel;
2635 trace->tool.sample = trace__process_sample;
2636 trace->tool.mmap = perf_event__process_mmap;
2637 trace->tool.mmap2 = perf_event__process_mmap2;
2638 trace->tool.comm = perf_event__process_comm;
2639 trace->tool.exit = perf_event__process_exit;
2640 trace->tool.fork = perf_event__process_fork;
2641 trace->tool.attr = perf_event__process_attr;
2642 trace->tool.tracing_data = perf_event__process_tracing_data;
2643 trace->tool.build_id = perf_event__process_build_id;
2644 trace->tool.namespaces = perf_event__process_namespaces;
2646 trace->tool.ordered_events = true;
2647 trace->tool.ordering_requires_timestamps = true;
2649 /* add tid to output */
2650 trace->multiple_threads = true;
2652 session = perf_session__new(&data, false, &trace->tool);
2653 if (session == NULL)
2656 if (trace->opts.target.pid)
2657 symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
2659 if (trace->opts.target.tid)
2660 symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
2662 if (symbol__init(&session->header.env) < 0)
2665 trace->host = &session->machines.host;
2667 err = perf_session__set_tracepoints_handlers(session, handlers);
2671 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2672 "raw_syscalls:sys_enter");
2673 /* older kernels have syscalls tp versus raw_syscalls */
2675 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2676 "syscalls:sys_enter");
2679 (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2680 perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2681 pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2685 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2686 "raw_syscalls:sys_exit");
2688 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2689 "syscalls:sys_exit");
2691 (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2692 perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2693 pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2697 evlist__for_each_entry(session->evlist, evsel) {
2698 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2699 (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2700 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2701 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2702 evsel->handler = trace__pgfault;
2707 err = perf_session__process_events(session);
2709 pr_err("Failed to process events, error %d", err);
2711 else if (trace->summary)
2712 trace__fprintf_thread_summary(trace, trace->output);
2715 perf_session__delete(session);
2720 static size_t trace__fprintf_threads_header(FILE *fp)
2724 printed = fprintf(fp, "\n Summary of events:\n\n");
2729 DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
2730 struct stats *stats;
2735 struct int_node *source = rb_entry(nd, struct int_node, rb_node);
2736 struct stats *stats = source->priv;
2738 entry->syscall = source->i;
2739 entry->stats = stats;
2740 entry->msecs = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
2743 static size_t thread__dump_stats(struct thread_trace *ttrace,
2744 struct trace *trace, FILE *fp)
2749 DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2751 if (syscall_stats == NULL)
2754 printed += fprintf(fp, "\n");
2756 printed += fprintf(fp, " syscall calls total min avg max stddev\n");
2757 printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
2758 printed += fprintf(fp, " --------------- -------- --------- --------- --------- --------- ------\n");
2760 resort_rb__for_each_entry(nd, syscall_stats) {
2761 struct stats *stats = syscall_stats_entry->stats;
2763 double min = (double)(stats->min) / NSEC_PER_MSEC;
2764 double max = (double)(stats->max) / NSEC_PER_MSEC;
2765 double avg = avg_stats(stats);
2767 u64 n = (u64) stats->n;
2769 pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2770 avg /= NSEC_PER_MSEC;
2772 sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2773 printed += fprintf(fp, " %-15s", sc->name);
2774 printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2775 n, syscall_stats_entry->msecs, min, avg);
2776 printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2780 resort_rb__delete(syscall_stats);
2781 printed += fprintf(fp, "\n\n");
2786 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2789 struct thread_trace *ttrace = thread__priv(thread);
2795 ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2797 printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2798 printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2799 printed += fprintf(fp, "%.1f%%", ratio);
2801 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2803 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2805 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2806 else if (fputc('\n', fp) != EOF)
2809 printed += thread__dump_stats(ttrace, trace, fp);
2814 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2816 return ttrace ? ttrace->nr_events : 0;
2819 DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
2820 struct thread *thread;
2823 entry->thread = rb_entry(nd, struct thread, rb_node);
2826 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2828 size_t printed = trace__fprintf_threads_header(fp);
2832 for (i = 0; i < THREADS__TABLE_SIZE; i++) {
2833 DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);
2835 if (threads == NULL) {
2836 fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2840 resort_rb__for_each_entry(nd, threads)
2841 printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2843 resort_rb__delete(threads);
2848 static int trace__set_duration(const struct option *opt, const char *str,
2849 int unset __maybe_unused)
2851 struct trace *trace = opt->value;
2853 trace->duration_filter = atof(str);
2857 static int trace__set_filter_pids(const struct option *opt, const char *str,
2858 int unset __maybe_unused)
2862 struct trace *trace = opt->value;
2864 * FIXME: introduce a intarray class, plain parse csv and create a
2865 * { int nr, int entries[] } struct...
2867 struct intlist *list = intlist__new(str);
2872 i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2873 trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2875 if (trace->filter_pids.entries == NULL)
2878 trace->filter_pids.entries[0] = getpid();
2880 for (i = 1; i < trace->filter_pids.nr; ++i)
2881 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2883 intlist__delete(list);
2889 static int trace__open_output(struct trace *trace, const char *filename)
2893 if (!stat(filename, &st) && st.st_size) {
2894 char oldname[PATH_MAX];
2896 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2898 rename(filename, oldname);
2901 trace->output = fopen(filename, "w");
2903 return trace->output == NULL ? -errno : 0;
2906 static int parse_pagefaults(const struct option *opt, const char *str,
2907 int unset __maybe_unused)
2909 int *trace_pgfaults = opt->value;
2911 if (strcmp(str, "all") == 0)
2912 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2913 else if (strcmp(str, "maj") == 0)
2914 *trace_pgfaults |= TRACE_PFMAJ;
2915 else if (strcmp(str, "min") == 0)
2916 *trace_pgfaults |= TRACE_PFMIN;
2923 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2925 struct perf_evsel *evsel;
2927 evlist__for_each_entry(evlist, evsel)
2928 evsel->handler = handler;
2932 * XXX: Hackish, just splitting the combined -e+--event (syscalls
2933 * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
2934 * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2936 * It'd be better to introduce a parse_options() variant that would return a
2937 * list with the terms it didn't match to an event...
2939 static int trace__parse_events_option(const struct option *opt, const char *str,
2940 int unset __maybe_unused)
2942 struct trace *trace = (struct trace *)opt->value;
2943 const char *s = str;
2944 char *sep = NULL, *lists[2] = { NULL, NULL, };
2945 int len = strlen(str) + 1, err = -1, list, idx;
2946 char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2947 char group_name[PATH_MAX];
2949 if (strace_groups_dir == NULL)
2954 trace->not_ev_qualifier = true;
2958 if ((sep = strchr(s, ',')) != NULL)
2962 if (syscalltbl__id(trace->sctbl, s) >= 0 ||
2963 syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
2966 path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2967 if (access(group_name, R_OK) == 0)
2972 sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2974 lists[list] = malloc(len);
2975 if (lists[list] == NULL)
2977 strcpy(lists[list], s);
2987 if (lists[1] != NULL) {
2988 struct strlist_config slist_config = {
2989 .dirname = strace_groups_dir,
2992 trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2993 if (trace->ev_qualifier == NULL) {
2994 fputs("Not enough memory to parse event qualifier", trace->output);
2998 if (trace__validate_ev_qualifier(trace))
3000 trace->trace_syscalls = true;
3006 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
3007 "event selector. use 'perf list' to list available events",
3008 parse_events_option);
3009 err = parse_events_option(&o, lists[0], 0);
3018 static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
3020 struct trace *trace = opt->value;
3022 if (!list_empty(&trace->evlist->entries))
3023 return parse_cgroups(opt, str, unset);
3025 trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
3030 int cmd_trace(int argc, const char **argv)
3032 const char *trace_usage[] = {
3033 "perf trace [<options>] [<command>]",
3034 "perf trace [<options>] -- <command> [<options>]",
3035 "perf trace record [<options>] [<command>]",
3036 "perf trace record [<options>] -- <command> [<options>]",
3039 struct trace trace = {
3048 .user_freq = UINT_MAX,
3049 .user_interval = ULLONG_MAX,
3050 .no_buffering = true,
3051 .mmap_pages = UINT_MAX,
3052 .proc_map_timeout = 500,
3056 .trace_syscalls = false,
3057 .kernel_syscallchains = false,
3058 .max_stack = UINT_MAX,
3060 const char *output_name = NULL;
3061 const struct option trace_options[] = {
3062 OPT_CALLBACK('e', "event", &trace, "event",
3063 "event/syscall selector. use 'perf list' to list available events",
3064 trace__parse_events_option),
3065 OPT_BOOLEAN(0, "comm", &trace.show_comm,
3066 "show the thread COMM next to its id"),
3067 OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3068 OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
3069 trace__parse_events_option),
3070 OPT_STRING('o', "output", &output_name, "file", "output file name"),
3071 OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3072 OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3073 "trace events on existing process id"),
3074 OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3075 "trace events on existing thread id"),
3076 OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3077 "pids to filter (by the kernel)", trace__set_filter_pids),
3078 OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3079 "system-wide collection from all CPUs"),
3080 OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3081 "list of cpus to monitor"),
3082 OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3083 "child tasks do not inherit counters"),
3084 OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3085 "number of mmap data pages",
3086 perf_evlist__parse_mmap_pages),
3087 OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3089 OPT_CALLBACK(0, "duration", &trace, "float",
3090 "show only events with duration > N.M ms",
3091 trace__set_duration),
3092 OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3093 OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3094 OPT_BOOLEAN('T', "time", &trace.full_time,
3095 "Show full timestamp, not time relative to first start"),
3096 OPT_BOOLEAN(0, "failure", &trace.failure_only,
3097 "Show only syscalls that failed"),
3098 OPT_BOOLEAN('s', "summary", &trace.summary_only,
3099 "Show only syscall summary with statistics"),
3100 OPT_BOOLEAN('S', "with-summary", &trace.summary,
3101 "Show all syscalls and summary with statistics"),
3102 OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3103 "Trace pagefaults", parse_pagefaults, "maj"),
3104 OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3105 OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3106 OPT_CALLBACK(0, "call-graph", &trace.opts,
3107 "record_mode[,record_size]", record_callchain_help,
3108 &record_parse_callchain_opt),
3109 OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
3110 "Show the kernel callchains on the syscall exit path"),
3111 OPT_UINTEGER(0, "min-stack", &trace.min_stack,
3112 "Set the minimum stack depth when parsing the callchain, "
3113 "anything below the specified depth will be ignored."),
3114 OPT_UINTEGER(0, "max-stack", &trace.max_stack,
3115 "Set the maximum stack depth when parsing the callchain, "
3116 "anything beyond the specified depth will be ignored. "
3117 "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
3118 OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
3119 "print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
3120 OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
3121 "per thread proc mmap processing timeout in ms"),
3122 OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
3123 trace__parse_cgroups),
3124 OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
3125 "ms to wait before starting measurement after program "
3129 bool __maybe_unused max_stack_user_set = true;
3130 bool mmap_pages_user_set = true;
3131 const char * const trace_subcommands[] = { "record", NULL };
3135 signal(SIGSEGV, sighandler_dump_stack);
3136 signal(SIGFPE, sighandler_dump_stack);
3138 trace.evlist = perf_evlist__new();
3139 trace.sctbl = syscalltbl__new();
3141 if (trace.evlist == NULL || trace.sctbl == NULL) {
3142 pr_err("Not enough memory to run!\n");
3147 argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3148 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3150 if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
3151 usage_with_options_msg(trace_usage, trace_options,
3152 "cgroup monitoring only available in system-wide mode");
3155 err = bpf__setup_stdout(trace.evlist);
3157 bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
3158 pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
3164 if (trace.trace_pgfaults) {
3165 trace.opts.sample_address = true;
3166 trace.opts.sample_time = true;
3169 if (trace.opts.mmap_pages == UINT_MAX)
3170 mmap_pages_user_set = false;
3172 if (trace.max_stack == UINT_MAX) {
3173 trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack();
3174 max_stack_user_set = false;
3177 #ifdef HAVE_DWARF_UNWIND_SUPPORT
3178 if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
3179 record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
3183 if (callchain_param.enabled) {
3184 if (!mmap_pages_user_set && geteuid() == 0)
3185 trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
3187 symbol_conf.use_callchain = true;
3190 if (trace.evlist->nr_entries > 0)
3191 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3193 if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3194 return trace__record(&trace, argc-1, &argv[1]);
3196 /* summary_only implies summary option, but don't overwrite summary if set */
3197 if (trace.summary_only)
3198 trace.summary = trace.summary_only;
3200 if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3201 trace.evlist->nr_entries == 0 /* Was --events used? */) {
3202 trace.trace_syscalls = true;
3205 if (output_name != NULL) {
3206 err = trace__open_output(&trace, output_name);
3208 perror("failed to create output file");
3213 err = target__validate(&trace.opts.target);
3215 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3216 fprintf(trace.output, "%s", bf);
3220 err = target__parse_uid(&trace.opts.target);
3222 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3223 fprintf(trace.output, "%s", bf);
3227 if (!argc && target__none(&trace.opts.target))
3228 trace.opts.target.system_wide = true;
3231 err = trace__replay(&trace);
3233 err = trace__run(&trace, argc, argv);
3236 if (output_name != NULL)
3237 fclose(trace.output);