4 * Builtin 'trace' command:
6 * Display a continuously updated trace of any workload, CPU, specific PID,
7 * system wide, etc. Default format is loosely strace like, but any other
8 * event may be specified using --event.
10 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
12 * Initially based on the 'trace' prototype by Thomas Gleixner:
14 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
16 * Released under the GPL v2. (and only v2, not any later version)
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
22 #include "util/cgroup.h"
23 #include "util/color.h"
24 #include "util/debug.h"
26 #include "util/event.h"
27 #include "util/evlist.h"
28 #include <subcmd/exec-cmd.h>
29 #include "util/machine.h"
30 #include "util/path.h"
31 #include "util/session.h"
32 #include "util/thread.h"
33 #include <subcmd/parse-options.h>
34 #include "util/strlist.h"
35 #include "util/intlist.h"
36 #include "util/thread_map.h"
37 #include "util/stat.h"
38 #include "trace/beauty/beauty.h"
39 #include "trace-event.h"
40 #include "util/parse-events.h"
41 #include "util/bpf-loader.h"
42 #include "callchain.h"
43 #include "print_binary.h"
45 #include "syscalltbl.h"
46 #include "rb_resort.h"
54 #include <linux/err.h>
55 #include <linux/filter.h>
56 #include <linux/kernel.h>
57 #include <linux/random.h>
58 #include <linux/stringify.h>
59 #include <linux/time64.h>
62 #include "sane_ctype.h"
/*
 * NOTE(review): this is a numbered extract with many lines dropped — the
 * surrounding "#ifndef O_CLOEXEC" guard and the "struct trace {" opener are
 * not visible here.  The fields below are the top-level state of a
 * 'perf trace' session (one instance per run).
 */
65 # define O_CLOEXEC 02000000
/* Fallback for older libc headers that lack the Linux-private fcntl range. */
68 #ifndef F_LINUX_SPECIFIC_BASE
69 # define F_LINUX_SPECIFIC_BASE 1024
/* --- fields of struct trace (opener dropped by the extraction) --- */
73 struct perf_tool tool;          /* callbacks for perf event processing */
74 struct syscalltbl *sctbl;       /* syscall id <-> name table for this arch */
77 struct syscall *table;          /* per-syscall info, indexed by syscall id */
79 struct perf_evsel *sys_enter,   /* raw_syscalls:sys_enter/sys_exit evsels */
83 struct record_opts opts;
84 struct perf_evlist *evlist;
86 struct thread *current;         /* thread whose events we are formatting */
87 struct cgroup *cgroup;
90 unsigned long nr_events;
91 struct strlist *ev_qualifier;   /* -e syscall name filter list */
100 double duration_filter;        /* --duration: only show calls longer than this (ms) */
106 unsigned int max_stack;
107 unsigned int min_stack;
108 bool not_ev_qualifier;         /* true when the -e list was negated ("!open,close") */
112 bool multiple_threads;
118 bool show_tool_stats;
120 bool kernel_syscallchains;
/*
 * tp_field: accessor for one tracepoint payload field.  Exactly one of the
 * two function pointers is set by the init helpers below: 'integer' decodes
 * a fixed-width unsigned value (byte-swapped if needed), 'pointer' returns
 * the raw payload address.  (struct opener/closer dropped by the extraction.)
 */
129 u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
130 void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
/*
 * Generates tp_field__u8/u16/u32/u64: memcpy out of the raw sample at the
 * recorded offset to avoid unaligned loads.  (Macro body is truncated here —
 * the surrounding function braces and return were dropped.)
 */
134 #define TP_UINT_FIELD(bits) \
135 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
138 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
/* Same, but byte-swapping variant for cross-endian perf.data files. */
147 #define TP_UINT_FIELD__SWAPPED(bits) \
148 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
151 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
152 return bswap_##bits(value);\
155 TP_UINT_FIELD__SWAPPED(16);
156 TP_UINT_FIELD__SWAPPED(32);
157 TP_UINT_FIELD__SWAPPED(64);
/*
 * Bind the right integer accessor for a field of 'size' bytes at 'offset'.
 * NOTE(review): the switch-on-size scaffolding (case labels, breaks, default
 * error return) was dropped by the extraction — only the assignments remain.
 */
159 static int __tp_field__init_uint(struct tp_field *field, int size, int offset, bool needs_swap)
161 field->offset = offset;
165 field->integer = tp_field__u8;
168 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
171 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
174 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
/* Convenience wrapper taking size/offset from a parsed format_field. */
183 static int tp_field__init_uint(struct tp_field *field, struct format_field *format_field, bool needs_swap)
185 return __tp_field__init_uint(field, format_field->size, format_field->offset, needs_swap);
/* Pointer accessor: just the address of the field inside the raw payload. */
188 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
190 return sample->raw_data + field->offset;
193 static int __tp_field__init_ptr(struct tp_field *field, int offset)
195 field->offset = offset;
196 field->pointer = tp_field__ptr;
200 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
202 return __tp_field__init_ptr(field, format_field->offset);
/*
 * struct syscall_tp (opener dropped): decoded-field accessors hung off
 * evsel->priv for the raw_syscalls:sys_{enter,exit} tracepoints —
 * 'id' (syscall nr), 'args' (enter) and 'ret' (exit).
 */
208 struct tp_field args, ret;
/* Resolve a named tracepoint field and bind an integer accessor to it. */
212 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
213 struct tp_field *field,
216 struct format_field *format_field = perf_evsel__field(evsel, name);
218 if (format_field == NULL)
221 return tp_field__init_uint(field, format_field, evsel->needs_swap);
/* Statement-expression macro: init the syscall_tp member of the same name. */
224 #define perf_evsel__init_sc_tp_uint_field(evsel, name) \
225 ({ struct syscall_tp *sc = evsel->priv;\
226 perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
/* Same as above, but binds a raw-pointer accessor. */
228 static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
229 struct tp_field *field,
232 struct format_field *format_field = perf_evsel__field(evsel, name);
234 if (format_field == NULL)
237 return tp_field__init_ptr(field, format_field);
240 #define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
241 ({ struct syscall_tp *sc = evsel->priv;\
242 perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
/* Free the priv syscall_tp along with the evsel (free of priv was on a dropped line). */
244 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
247 perf_evsel__delete(evsel);
/*
 * Allocate evsel->priv and locate "__syscall_nr" (used for the
 * syscalls:sys_* style tracepoints).  Error paths are on dropped lines.
 */
250 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel)
252 struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));
254 if (evsel->priv != NULL) {
255 if (perf_evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr"))
/* raw_syscalls:* variant: field is plain "id"; also installs the handler. */
266 static int perf_evsel__init_raw_syscall_tp(struct perf_evsel *evsel, void *handler)
268 evsel->priv = malloc(sizeof(struct syscall_tp));
269 if (evsel->priv != NULL) {
270 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
273 evsel->handler = handler;
/*
 * Create a raw_syscalls:{sys_enter,sys_exit} tracepoint evsel, falling back
 * to the pre-raw_syscalls name on old kernels.
 */
284 static struct perf_evsel *perf_evsel__raw_syscall_newtp(const char *direction, void *handler)
286 struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
288 /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
290 evsel = perf_evsel__newtp("syscalls", direction);
295 if (perf_evsel__init_raw_syscall_tp(evsel, handler))
301 perf_evsel__delete_priv(evsel);
/* Fetch a decoded field from a sample via the accessors bound above. */
305 #define perf_evsel__sc_tp_uint(evsel, name, sample) \
306 ({ struct syscall_tp *fields = evsel->priv; \
307 fields->name.integer(&fields->name, sample); })
309 #define perf_evsel__sc_tp_ptr(evsel, name, sample) \
310 ({ struct syscall_tp *fields = evsel->priv; \
311 fields->name.pointer(&fields->name, sample); })
/*
 * Print 'val' as its symbolic name from a strarray, or with 'intfmt' when it
 * is out of range / has no entry.  'offset' lets arrays start at a nonzero
 * first value (e.g. EPOLL_CTL_ADD == 1).
 */
313 size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
315 int idx = val - sa->offset;
317 if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL)
318 return scnprintf(bf, size, intfmt, val);
320 return scnprintf(bf, size, "%s", sa->entries[idx]);
/* syscall-arg adapter: the strarray comes in via arg->parm. */
323 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
325 struct syscall_arg *arg)
327 return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
330 static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
331 struct syscall_arg *arg)
333 return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
336 #define SCA_STRARRAY syscall_arg__scnprintf_strarray
/* strarrays: a set of strarrays searched in order (e.g. fcntl cmd ranges). */
340 struct strarray **entries;
343 #define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
344 .nr_entries = ARRAY_SIZE(array), \
/* Try each member array; fall back to the numeric value when none matches. */
348 size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
349 struct syscall_arg *arg)
351 struct strarrays *sas = arg->parm;
354 for (i = 0; i < sas->nr_entries; ++i) {
355 struct strarray *sa = sas->entries[i];
356 int idx = arg->val - sa->offset;
358 if (idx >= 0 && idx < sa->nr_entries) {
359 if (sa->entries[idx] == NULL)
361 return scnprintf(bf, size, "%s", sa->entries[idx]);
365 return scnprintf(bf, size, "%d", arg->val);
/* Fallback when the libc headers don't define AT_FDCWD. */
369 #define AT_FDCWD -100
/* *at() dirfd argument: print "CWD" for AT_FDCWD, else as a regular fd. */
372 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
373 struct syscall_arg *arg)
378 return scnprintf(bf, size, "CWD");
380 return syscall_arg__scnprintf_fd(bf, size, arg);
383 #define SCA_FDAT syscall_arg__scnprintf_fd_at
/* Forward declaration: close(fd) printer also invalidates the cached path. */
385 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
386 struct syscall_arg *arg);
388 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
/* Generic scalar printers used by the syscall_fmts table. */
390 size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
392 return scnprintf(bf, size, "%#lx", arg->val);
395 size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
397 return scnprintf(bf, size, "%d", arg->val);
400 size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
402 return scnprintf(bf, size, "%ld", arg->val);
/*
 * Symbolic-name tables for enum-like syscall arguments, each wrapped in a
 * strarray via DEFINE_STRARRAY[_OFFSET].  The common prefix (BPF_, CLOCK_,
 * AF_, ...) is stripped so output stays compact.  Several array closers
 * ("};") fell on lines dropped by the extraction.
 */
405 static const char *bpf_cmd[] = {
406 "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
407 "MAP_GET_NEXT_KEY", "PROG_LOAD",
409 static DEFINE_STRARRAY(bpf_cmd);
/* EPOLL_CTL_ADD starts at 1, hence the offset. */
411 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
412 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
414 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
415 static DEFINE_STRARRAY(itimers);
417 static const char *keyctl_options[] = {
418 "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
419 "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
420 "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
421 "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
422 "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
424 static DEFINE_STRARRAY(keyctl_options);
426 static const char *whences[] = { "SET", "CUR", "END",
434 static DEFINE_STRARRAY(whences);
436 static const char *fcntl_cmds[] = {
437 "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
438 "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
439 "SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
442 static DEFINE_STRARRAY(fcntl_cmds);
/* Linux-private fcntl cmds start at F_LINUX_SPECIFIC_BASE (1024); [5] skips a hole. */
444 static const char *fcntl_linux_specific_cmds[] = {
445 "SETLEASE", "GETLEASE", "NOTIFY", [5] = "CANCELLK", "DUPFD_CLOEXEC",
446 "SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
447 "GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
450 static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);
/* Both fcntl ranges, searched in order by syscall_arg__scnprintf_strarrays(). */
452 static struct strarray *fcntl_cmds_arrays[] = {
453 &strarray__fcntl_cmds,
454 &strarray__fcntl_linux_specific_cmds,
457 static DEFINE_STRARRAYS(fcntl_cmds_arrays);
459 static const char *rlimit_resources[] = {
460 "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
461 "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
464 static DEFINE_STRARRAY(rlimit_resources);
466 static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
467 static DEFINE_STRARRAY(sighow);
469 static const char *clockid[] = {
470 "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
471 "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
472 "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
474 static DEFINE_STRARRAY(clockid);
476 static const char *socket_families[] = {
477 "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
478 "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
479 "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
480 "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
481 "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
482 "ALG", "NFC", "VSOCK",
484 static DEFINE_STRARRAY(socket_families);
/*
 * access(2) mode printer: "F" for F_OK, else a '|'-joined set of R/W/X.
 * The P_MODE() helper macro's #define line and the closing "}"s fell on
 * dropped lines; the trailing branch prints any leftover unknown bits.
 */
486 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
487 struct syscall_arg *arg)
492 if (mode == F_OK) /* 0 */
493 return scnprintf(bf, size, "F");
495 if (mode & n##_OK) { \
496 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
506 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
511 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
/* Forward declaration: defined after the thread filename machinery below. */
513 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
514 struct syscall_arg *arg);
516 #define SCA_FILENAME syscall_arg__scnprintf_filename
/* pipe2(2) flags: known O_* bits by name, unknown leftovers in hex. */
518 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
519 struct syscall_arg *arg)
521 int printed = 0, flags = arg->val;
524 if (flags & O_##n) { \
525 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
534 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
539 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
/* Fallbacks for libc headers predating getrandom(2). */
541 #ifndef GRND_NONBLOCK
542 #define GRND_NONBLOCK 0x0001
545 #define GRND_RANDOM 0x0002
/* getrandom(2) flags, same accumulate-and-clear pattern as pipe_flags. */
548 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
549 struct syscall_arg *arg)
551 int printed = 0, flags = arg->val;
554 if (flags & GRND_##n) { \
555 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
556 flags &= ~GRND_##n; \
564 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
569 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
/* Shorthand initializer for a syscall_arg_fmt backed by a strarray. */
571 #define STRARRAY(name, array) \
572 { .scnprintf = SCA_STRARRAY, \
573 .parm = &strarray__##array, }
575 #include "trace/beauty/arch_errno_names.c"
576 #include "trace/beauty/eventfd.c"
577 #include "trace/beauty/futex_op.c"
578 #include "trace/beauty/futex_val3.c"
579 #include "trace/beauty/mmap.c"
580 #include "trace/beauty/mode_t.c"
581 #include "trace/beauty/msg_flags.c"
582 #include "trace/beauty/open_flags.c"
583 #include "trace/beauty/perf_event_open.c"
584 #include "trace/beauty/pid.c"
585 #include "trace/beauty/sched_policy.c"
586 #include "trace/beauty/seccomp.c"
587 #include "trace/beauty/signum.c"
588 #include "trace/beauty/socket_type.c"
589 #include "trace/beauty/waitid_options.c"
/*
 * Per-argument formatter descriptor: how to pretty-print one positional
 * syscall argument (function pointer + optional parm, name, flags — some
 * members fell on dropped lines).
 */
591 struct syscall_arg_fmt {
592 size_t (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
/*
 * syscall_fmts[]: per-syscall formatting overrides, looked up by bsearch —
 * MUST stay sorted by name.  .alias maps to the arch's real tracepoint name,
 * .errpid marks pid-returning calls, .timeout/.hexret tweak return printing,
 * and .arg[] installs per-position pretty-printers.  Many entries' opening
 * "{ .name = ..." lines were dropped by the extraction.
 */
598 static struct syscall_fmt {
601 struct syscall_arg_fmt arg[6];
608 .arg = { [1] = { .scnprintf = SCA_ACCMODE, /* mode */ }, }, },
610 .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
611 { .name = "brk", .hexret = true,
612 .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
613 { .name = "clock_gettime",
614 .arg = { [0] = STRARRAY(clk_id, clockid), }, },
615 { .name = "clone", .errpid = true, .nr_args = 5,
616 .arg = { [0] = { .name = "flags", .scnprintf = SCA_CLONE_FLAGS, },
617 [1] = { .name = "child_stack", .scnprintf = SCA_HEX, },
618 [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
619 [3] = { .name = "child_tidptr", .scnprintf = SCA_HEX, },
620 [4] = { .name = "tls", .scnprintf = SCA_HEX, }, }, },
622 .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
623 { .name = "epoll_ctl",
624 .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
625 { .name = "eventfd2",
626 .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
627 { .name = "fchmodat",
628 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
629 { .name = "fchownat",
630 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
632 .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
633 .parm = &strarrays__fcntl_cmds_arrays,
634 .show_zero = true, },
635 [2] = { .scnprintf = SCA_FCNTL_ARG, /* arg */ }, }, },
637 .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
638 { .name = "fstat", .alias = "newfstat", },
639 { .name = "fstatat", .alias = "newfstatat", },
641 .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
642 [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
643 { .name = "futimesat",
644 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
645 { .name = "getitimer",
646 .arg = { [0] = STRARRAY(which, itimers), }, },
647 { .name = "getpid", .errpid = true, },
648 { .name = "getpgid", .errpid = true, },
649 { .name = "getppid", .errpid = true, },
650 { .name = "getrandom",
651 .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
652 { .name = "getrlimit",
653 .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
654 { .name = "gettid", .errpid = true, },
/* ioctl cmd decoding is x86-only so far (see FIXME below). */
657 #if defined(__i386__) || defined(__x86_64__)
659 * FIXME: Make this available to all arches.
661 [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
662 [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
664 [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
666 { .name = "kcmp", .nr_args = 5,
667 .arg = { [0] = { .name = "pid1", .scnprintf = SCA_PID, },
668 [1] = { .name = "pid2", .scnprintf = SCA_PID, },
669 [2] = { .name = "type", .scnprintf = SCA_KCMP_TYPE, },
670 [3] = { .name = "idx1", .scnprintf = SCA_KCMP_IDX, },
671 [4] = { .name = "idx2", .scnprintf = SCA_KCMP_IDX, }, }, },
673 .arg = { [0] = STRARRAY(option, keyctl_options), }, },
675 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
677 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
679 .arg = { [2] = STRARRAY(whence, whences), }, },
680 { .name = "lstat", .alias = "newlstat", },
682 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
683 [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
685 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
687 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
689 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
690 { .name = "mlockall",
691 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
692 { .name = "mmap", .hexret = true,
693 /* The standard mmap maps to old_mmap on s390x */
694 #if defined(__s390x__)
697 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ },
698 [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ },
699 [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ }, }, },
700 { .name = "mprotect",
701 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
702 [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ }, }, },
703 { .name = "mq_unlink",
704 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
705 { .name = "mremap", .hexret = true,
706 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ },
707 [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
708 [4] = { .scnprintf = SCA_HEX, /* new_addr */ }, }, },
710 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
712 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
713 { .name = "name_to_handle_at",
714 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
715 { .name = "newfstatat",
716 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
718 .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
719 { .name = "open_by_handle_at",
720 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ },
721 [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
723 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ },
724 [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
725 { .name = "perf_event_open",
726 .arg = { [2] = { .scnprintf = SCA_INT, /* cpu */ },
727 [3] = { .scnprintf = SCA_FD, /* group_fd */ },
728 [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
730 .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
731 { .name = "pkey_alloc",
732 .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS, /* access_rights */ }, }, },
733 { .name = "pkey_free",
734 .arg = { [0] = { .scnprintf = SCA_INT, /* key */ }, }, },
735 { .name = "pkey_mprotect",
736 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
737 [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ },
738 [3] = { .scnprintf = SCA_INT, /* pkey */ }, }, },
739 { .name = "poll", .timeout = true, },
740 { .name = "ppoll", .timeout = true, },
741 { .name = "prctl", .alias = "arch_prctl",
742 .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
743 [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
744 [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
745 { .name = "pread", .alias = "pread64", },
/* NOTE(review): "preadv" aliased to "pread" looks suspicious (upstream uses "preadv2"-era naming); confirm against the tree this was extracted from. */
746 { .name = "preadv", .alias = "pread", },
747 { .name = "prlimit64",
748 .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
749 { .name = "pwrite", .alias = "pwrite64", },
750 { .name = "readlinkat",
751 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
752 { .name = "recvfrom",
753 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
754 { .name = "recvmmsg",
755 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
757 .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
758 { .name = "renameat",
759 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
760 { .name = "rt_sigaction",
761 .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
762 { .name = "rt_sigprocmask",
763 .arg = { [0] = STRARRAY(how, sighow), }, },
764 { .name = "rt_sigqueueinfo",
765 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
766 { .name = "rt_tgsigqueueinfo",
767 .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
768 { .name = "sched_setscheduler",
769 .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
771 .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP, /* op */ },
772 [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
773 { .name = "select", .timeout = true, },
774 { .name = "sendmmsg",
775 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
777 .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
779 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
780 { .name = "set_tid_address", .errpid = true, },
781 { .name = "setitimer",
782 .arg = { [0] = STRARRAY(which, itimers), }, },
783 { .name = "setrlimit",
784 .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
786 .arg = { [0] = STRARRAY(family, socket_families),
787 [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
788 [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
789 { .name = "socketpair",
790 .arg = { [0] = STRARRAY(family, socket_families),
791 [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
792 [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
793 { .name = "stat", .alias = "newstat", },
795 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fdat */ },
796 [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
797 [3] = { .scnprintf = SCA_STATX_MASK, /* mask */ }, }, },
799 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
801 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
802 { .name = "symlinkat",
803 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
805 .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
807 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
808 { .name = "uname", .alias = "newuname", },
809 { .name = "unlinkat",
810 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
811 { .name = "utimensat",
812 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
813 { .name = "wait4", .errpid = true,
814 .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
815 { .name = "waitid", .errpid = true,
816 .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
/* bsearch comparator: key is the syscall name string, element a syscall_fmt. */
819 static int syscall_fmt__cmp(const void *name, const void *fmtp)
821 const struct syscall_fmt *fmt = fmtp;
822 return strcmp(name, fmt->name);
/* Look up a syscall's formatting override; relies on syscall_fmts[] being sorted. */
825 static struct syscall_fmt *syscall_fmt__find(const char *name)
827 const int nmemb = ARRAY_SIZE(syscall_fmts);
828 return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
/*
 * struct syscall (opener dropped): everything needed to format one syscall —
 * its tracepoint format, parsed arg fields, and the fmt override found above.
 */
832 * is_exit: is this "exit" or "exit_group"?
833 * is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter.
836 struct event_format *tp_format;
840 struct format_field *args;
842 struct syscall_fmt *fmt;
843 struct syscall_arg_fmt *arg_fmt;
/*
847 * We need to have this 'calculated' boolean because in some cases we really
848 * don't know what is the duration of a syscall, for instance, when we start
849 * a session and some threads are waiting for a syscall to finish, say 'poll',
850 * in which case all we can do is to print "( ? ) for duration and for the
 */
/* Print "(  d.ddd ms): " color-coded by magnitude; blank when not calculated. */
853 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
855 double duration = (double)t / NSEC_PER_MSEC;
856 size_t printed = fprintf(fp, "(");
859 printed += fprintf(fp, " ");
860 else if (duration >= 1.0)
861 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
862 else if (duration >= 0.01)
863 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
865 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
866 return printed + fprintf(fp, "): ");
/*
870 * filename.ptr: The filename char pointer that will be vfs_getname'd
871 * filename.entry_str_pos: Where to insert the string translated from
872 * filename.ptr by the vfs_getname tracepoint/kprobe.
873 * ret_scnprintf: syscall args may set this to a different syscall return
874 * formatter, for instance, fcntl may return fds, file flags, etc.
 */
/* Per-thread trace state, hung off thread->priv (several members on dropped lines). */
876 struct thread_trace {
879 unsigned long nr_events;
880 unsigned long pfmaj, pfmin;           /* major/minor page fault counts */
883 size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
886 short int entry_str_pos;
888 unsigned int namelen;
896 struct intlist *syscall_stats;
/* Allocate zeroed per-thread state; paths.max == -1 means "no fd table yet". */
899 static struct thread_trace *thread_trace__new(void)
901 struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));
904 ttrace->paths.max = -1;
906 ttrace->syscall_stats = intlist__new(NULL);
/*
 * Get-or-create the thread's trace state; on allocation failure warns once
 * on fp and (on a dropped line) returns NULL so the sample is skipped.
 */
911 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
913 struct thread_trace *ttrace;
918 if (thread__priv(thread) == NULL)
919 thread__set_priv(thread, thread_trace__new());
921 if (thread__priv(thread) == NULL)
924 ttrace = thread__priv(thread);
929 color_fprintf(fp, PERF_COLOR_RED,
930 "WARNING: not enough memory, dropping samples!\n");
/* Let an arg formatter override how this call's return value is printed. */
935 void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
936 size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
938 struct thread_trace *ttrace = thread__priv(arg->thread);
940 ttrace->ret_scnprintf = ret_scnprintf;
/* Page-fault selection bits for --pf. */
943 #define TRACE_PFMAJ (1 << 0)
944 #define TRACE_PFMIN (1 << 1)
946 static const size_t trace__entry_str_size = 2048;
/*
 * Cache 'pathname' for 'fd' in the thread's fd->path table, growing the
 * table on demand.  Uses a temporary for realloc so the old table survives
 * failure; newly exposed slots are zeroed before use.
 */
948 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
950 struct thread_trace *ttrace = thread__priv(thread);
952 if (fd > ttrace->paths.max) {
953 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
958 if (ttrace->paths.max != -1) {
959 memset(npath + ttrace->paths.max + 1, 0,
960 (fd - ttrace->paths.max) * sizeof(char *));
962 memset(npath, 0, (fd + 1) * sizeof(char *));
965 ttrace->paths.table = npath;
966 ttrace->paths.max = fd;
969 ttrace->paths.table[fd] = strdup(pathname);
971 return ttrace->paths.table[fd] != NULL ? 0 : -1;
/*
 * Resolve fd -> path via /proc/<pid>/fd (or /proc/<pid>/task/<tid>/fd for
 * non-leader threads), then cache it.  lstat first so the st_size bound on
 * the readlink result can be checked.
 */
974 static int thread__read_fd_path(struct thread *thread, int fd)
976 char linkname[PATH_MAX], pathname[PATH_MAX];
980 if (thread->pid_ == thread->tid) {
981 scnprintf(linkname, sizeof(linkname),
982 "/proc/%d/fd/%d", thread->pid_, fd);
984 scnprintf(linkname, sizeof(linkname),
985 "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
988 if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
991 ret = readlink(linkname, pathname, sizeof(pathname));
993 if (ret < 0 || ret > st.st_size)
/* readlink does not NUL-terminate; do it ourselves. */
996 pathname[ret] = '\0';
997 return trace__set_fd_pathname(thread, fd, pathname);
/* Cached fd->path lookup, falling back to a /proc read (counted in stats). */
1000 static const char *thread__fd_path(struct thread *thread, int fd,
1001 struct trace *trace)
1003 struct thread_trace *ttrace = thread__priv(thread);
1011 if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
1014 ++trace->stats.proc_getname;
1015 if (thread__read_fd_path(thread, fd))
1019 return ttrace->paths.table[fd];
/* Print "fd<path>" when the path is known, else just the number. */
1022 size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
1025 size_t printed = scnprintf(bf, size, "%d", fd);
1026 const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1029 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
/* Same, but starting from a pid: find the thread, then drop the reference. */
1034 size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1036 size_t printed = scnprintf(bf, size, "%d", fd);
1037 struct thread *thread = machine__find_thread(trace->host, pid, pid);
1040 const char *path = thread__fd_path(thread, fd, trace);
1043 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1045 thread__put(thread);
/* close(2) fd printer: print it, then drop the cached path — fd is now stale. */
1051 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1052 struct syscall_arg *arg)
1055 size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1056 struct thread_trace *ttrace = thread__priv(arg->thread);
1058 if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1059 zfree(&ttrace->paths.table[fd]);
/*
 * Remember where in the entry string a filename argument sits, so the
 * vfs_getname payload can later be spliced in at entry_str_pos.
 */
1064 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1067 struct thread_trace *ttrace = thread__priv(thread);
1069 ttrace->filename.ptr = ptr;
1070 ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
/* Filename arg printer: raw pointer when vfs_getname is unavailable. */
1073 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1074 struct syscall_arg *arg)
1076 unsigned long ptr = arg->val;
1078 if (!arg->trace->vfs_getname)
1079 return scnprintf(bf, size, "%#x", ptr);
1081 thread__set_filename_pos(arg->thread, bf, ptr);
/* --duration filter: true when the call (in ns) is below the threshold (ms). */
1085 static bool trace__filter_duration(struct trace *trace, double t)
1087 return t < (trace->duration_filter * NSEC_PER_MSEC);
/* Timestamp in ms relative to the session's base_time. */
1090 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1092 double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1094 return fprintf(fp, "%10.3f ", ts);
/*
1098 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1099 * using ttrace->entry_time for a thread that receives a sys_exit without
1100 * first having received a sys_enter ("poll" issued before tracing session
1101 * starts, lost sys_enter exit due to ring buffer overflow).
 */
1103 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1106 return __trace__fprintf_tstamp(trace, tstamp, fp);
1108 return fprintf(fp, " ? ");
/* Main-loop exit flags, set from the signal handler. */
1111 static bool done = false;
1112 static bool interrupted = false;
1114 static void sig_handler(int sig)
1117 interrupted = sig == SIGINT;
/* Line prefix for a syscall entry: timestamp, duration, comm/tid. */
1120 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1121 u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1123 size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1124 printed += fprintf_duration(duration, duration_calculated, fp);
1126 if (trace->multiple_threads) {
1127 if (trace->show_comm)
1128 printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1129 printed += fprintf(fp, "%d ", thread->tid);
/*
 * Side-band event dispatcher: report LOST events loudly (they mean dropped
 * syscalls), forward everything to the machine for thread/mmap bookkeeping.
 */
1135 static int trace__process_event(struct trace *trace, struct machine *machine,
1136 union perf_event *event, struct perf_sample *sample)
1140 switch (event->header.type) {
1141 case PERF_RECORD_LOST:
1142 color_fprintf(trace->output, PERF_COLOR_RED,
1143 "LOST %" PRIu64 " events!\n", event->lost.lost);
1144 ret = machine__process_lost_event(machine, event, sample);
1147 ret = machine__process_event(machine, event, sample);
/* perf_tool callback adapter: recover the trace from the embedded tool. */
1154 static int trace__tool_process(struct perf_tool *tool,
1155 union perf_event *event,
1156 struct perf_sample *sample,
1157 struct machine *machine)
1159 struct trace *trace = container_of(tool, struct trace, tool);
1160 return trace__process_event(trace, machine, event, sample);
/*
 * Kernel address resolver that warns (once per machine) when kptr_restrict
 * hides /proc/kallsyms, then delegates to the normal resolver.
 */
1163 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1165 struct machine *machine = vmachine;
1167 if (machine->kptr_restrict_warned)
1170 if (symbol_conf.kptr_restrict) {
1171 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1172 "Check /proc/sys/kernel/kptr_restrict.\n\n"
1173 "Kernel samples will not be resolved.\n")
1174 machine->kptr_restrict_warned = true;
1178 return machine__resolve_kernel_addr(vmachine, addrp, modp);
/*
 * Set up symbol resolution for the live host: create the host machine,
 * register the kptr-aware resolver, and synthesize existing threads so
 * samples arriving mid-run can be attributed.
 */
1181 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1183 int err = symbol__init(NULL);
1188 trace->host = machine__new_host();
1189 if (trace->host == NULL)
1192 err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
1196 err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1197 evlist->threads, trace__tool_process, false,
1198 trace->opts.proc_map_timeout, 1);
/* Teardown counterpart of trace__symbols_init(). */
1206 static void trace__symbols__exit(struct trace *trace)
1208 machine__exit(trace->host);
/*
 * Allocate sc->arg_fmt[nr_args] and seed it from the syscall_fmts override
 * (fmt->nr_args wins over the tracepoint's 6 when set, e.g. clone).
 */
1214 static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1218 if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1219 nr_args = sc->fmt->nr_args;
1221 sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1222 if (sc->arg_fmt == NULL)
1225 for (idx = 0; idx < nr_args; ++idx) {
1227 sc->arg_fmt[idx] = sc->fmt->arg[idx];
1230 sc->nr_args = nr_args;
/*
 * Heuristically pick printers for args that have no explicit override,
 * keyed off the tracepoint field's C type and name: filename-ish strings,
 * pointers, pid_t, umode_t, and integers whose name ends in "fd".
 */
1234 static int syscall__set_arg_fmts(struct syscall *sc)
1236 struct format_field *field;
1239 for (field = sc->args; field; field = field->next, ++idx) {
/* An explicit table entry always wins over the heuristics below. */
1240 if (sc->fmt && sc->fmt->arg[idx].scnprintf)
1243 if (strcmp(field->type, "const char *") == 0 &&
1244 (strcmp(field->name, "filename") == 0 ||
1245 strcmp(field->name, "path") == 0 ||
1246 strcmp(field->name, "pathname") == 0))
1247 sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
1248 else if (field->flags & FIELD_IS_POINTER)
1249 sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
1250 else if (strcmp(field->type, "pid_t") == 0)
1251 sc->arg_fmt[idx].scnprintf = SCA_PID;
1252 else if (strcmp(field->type, "umode_t") == 0)
1253 sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
1254 else if ((strcmp(field->type, "int") == 0 ||
1255 strcmp(field->type, "unsigned int") == 0 ||
1256 strcmp(field->type, "long") == 0) &&
1257 (len = strlen(field->name)) >= 2 &&
1258 strcmp(field->name + len - 2, "fd") == 0) {
/*
1260 * /sys/kernel/tracing/events/syscalls/sys_enter*
1261 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
 */
1266 sc->arg_fmt[idx].scnprintf = SCA_FD;
/*
 * Lazily populate trace->syscalls.table[id]: grow the table when 'id'
 * exceeds the current max, resolve the syscall name, find its static
 * fmt entry, read the syscalls:sys_enter_<name> tracepoint format
 * (falling back to the fmt alias), allocate per-arg format slots and
 * finally infer the argument printers.
 */
1273 static int trace__read_syscall_info(struct trace *trace, int id)
1277 const char *name = syscalltbl__name(trace->sctbl, id);
1282 if (id > trace->syscalls.max) {
1283 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1285 if (nsyscalls == NULL)
/* Zero only the newly grown tail; on first allocation zero it all. */
1288 if (trace->syscalls.max != -1) {
1289 memset(nsyscalls + trace->syscalls.max + 1, 0,
1290 (id - trace->syscalls.max) * sizeof(*sc));
1292 memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1295 trace->syscalls.table = nsyscalls;
1296 trace->syscalls.max = id;
1299 sc = trace->syscalls.table + id;
1302 sc->fmt = syscall_fmt__find(sc->name);
1304 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1305 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
/* Some syscalls are only exposed under an alias tracepoint name. */
1307 if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1308 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1309 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
/* No format file readable -> fall back to the generic 6-arg maximum. */
1312 if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
1315 if (IS_ERR(sc->tp_format))
1318 sc->args = sc->tp_format->format.fields;
1320 * We need to check and discard the first variable '__syscall_nr'
1321 * or 'nr' that mean the syscall number. It is needless here.
1322 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
1324 if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1325 sc->args = sc->args->next;
1329 sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1330 sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat");
1332 return syscall__set_arg_fmts(sc);
/*
 * Translate the event-qualifier syscall name list into syscall ids in
 * trace->ev_qualifier_ids, expanding glob patterns to every matching
 * syscall and growing the ids array as matches exceed the pre-counted
 * size. On unknown names, print all offenders plus a hint and free
 * the partially built id list.
 */
1335 static int trace__validate_ev_qualifier(struct trace *trace)
1338 size_t nr_allocated;
1339 struct str_node *pos;
1341 trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1342 trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1343 sizeof(trace->ev_qualifier_ids.entries[0]));
1345 if (trace->ev_qualifier_ids.entries == NULL) {
1346 fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1352 nr_allocated = trace->ev_qualifier_ids.nr;
1355 strlist__for_each_entry(pos, trace->ev_qualifier) {
1356 const char *sc = pos->s;
1357 int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;
/* Not an exact syscall name: try it as a glob pattern. */
1360 id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
1365 fputs("Error:\tInvalid syscall ", trace->output);
1368 fputs(", ", trace->output);
1371 fputs(sc, trace->output);
1374 trace->ev_qualifier_ids.entries[i++] = id;
1375 if (match_next == -1)
/* Collect every further syscall the glob matches. */
1379 id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
/* Glob produced more ids than strlist entries: grow the array. */
1382 if (nr_allocated == trace->ev_qualifier_ids.nr) {
1386 entries = realloc(trace->ev_qualifier_ids.entries,
1387 nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
1388 if (entries == NULL) {
1390 fputs("\nError:\t Not enough memory for parsing\n", trace->output);
1393 trace->ev_qualifier_ids.entries = entries;
1395 trace->ev_qualifier_ids.nr++;
1396 trace->ev_qualifier_ids.entries[i++] = id;
1401 fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1402 "\nHint:\tand: 'man syscalls'\n", trace->output);
1404 zfree(&trace->ev_qualifier_ids.entries);
1405 trace->ev_qualifier_ids.nr = 0;
1412 * args is to be interpreted as a series of longs but we need to handle
1413 * 8-byte unaligned accesses. args points to raw_data within the event
1414 * and raw_data is guaranteed to be 8-byte unaligned because it is
1415 * preceded by raw_size which is a u32. So we need to copy args to a temp
1416 * variable to read it. Most notably this avoids extended load instructions
1417 * on unaligned addresses
1419 unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
/*
 * Fetch the idx-th raw argument via memcpy (see comment above).
 * NOTE(review): the temp 'val' declaration and the 'return val;' lines
 * are not visible in this chunk.
 */
1422 unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1424 memcpy(&val, p, sizeof(val));
1428 static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1429 struct syscall_arg *arg)
1431 if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1432 return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1434 return scnprintf(bf, size, "arg%d: ", arg->idx);
/*
 * Print one argument value: use the arg's dedicated beautifier when
 * set (passing along its 'parm' cookie), otherwise print as a signed
 * long. NOTE(review): the line storing 'val' into the arg before
 * calling the beautifier is not visible in this chunk.
 */
1437 static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1438 struct syscall_arg *arg, unsigned long val)
1440 if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1442 if (sc->arg_fmt[arg->idx].parm)
1443 arg->parm = sc->arg_fmt[arg->idx].parm;
1444 return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1446 return scnprintf(bf, size, "%ld", val);
/*
 * Format a syscall's argument list into bf. With a parsed tracepoint
 * format, walk the fields applying the per-arg beautifiers and
 * suppressing unannotated zero-valued args; when tp_format could not
 * be read (IS_ERR), dump the raw longs positionally instead.
 */
1449 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1450 unsigned char *args, struct trace *trace,
1451 struct thread *thread)
1456 struct syscall_arg arg = {
1463 struct thread_trace *ttrace = thread__priv(thread);
1466 * Things like fcntl will set this in its 'cmd' formatter to pick the
1467 * right formatter for the return value (an fd? file flags?), which is
1468 * not needed for syscalls that always return a given type, say an fd.
1470 ttrace->ret_scnprintf = NULL;
1472 if (sc->args != NULL) {
1473 struct format_field *field;
1475 for (field = sc->args; field;
1476 field = field->next, ++arg.idx, bit <<= 1) {
1480 val = syscall_arg__val(&arg, arg.idx);
1483 * Suppress this argument if its value is zero and
1484 * and we don't have a string associated in an
1489 (sc->arg_fmt[arg.idx].show_zero ||
1490 sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
1491 sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
1492 sc->arg_fmt[arg.idx].parm))
1495 printed += scnprintf(bf + printed, size - printed,
1496 "%s%s: ", printed ? ", " : "", field->name);
1497 printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
1499 } else if (IS_ERR(sc->tp_format)) {
1501 * If we managed to read the tracepoint /format file, then we
1502 * may end up not having any args, like with gettid(), so only
1503 * print the raw args when we didn't manage to read it.
1505 while (arg.idx < sc->nr_args) {
1508 val = syscall_arg__val(&arg, arg.idx);
1510 printed += scnprintf(bf + printed, size - printed, ", ");
1511 printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
1512 printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
/* Signature shared by the per-tracepoint sample handlers below. */
1522 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1523 union perf_event *event,
1524 struct perf_sample *sample);
/*
 * Return the cached struct syscall for 'id', reading its tracepoint
 * info on first use. Tolerates the known kernel quirk of sys_exit
 * events carrying id == -1 (reported only at higher verbosity), and
 * prints a diagnostic before the failure return.
 */
1526 static struct syscall *trace__syscall_info(struct trace *trace,
1527 struct perf_evsel *evsel, int id)
1533 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1534 * before that, leaving at a higher verbosity level till that is
1535 * explained. Reproduced with plain ftrace with:
1537 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1538 * grep "NR -1 " /t/trace_pipe
1540 * After generating some load on the machine.
1544 fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1545 id, perf_evsel__name(evsel), ++n);
/* First time we see this id: read its tracepoint format lazily. */
1550 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1551 trace__read_syscall_info(trace, id))
1554 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1557 return &trace->syscalls.table[id];
1561 fprintf(trace->output, "Problems reading syscall %d", id);
1562 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1563 fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1564 fputs(" information\n", trace->output);
/*
 * Accumulate per-thread, per-syscall-id duration statistics (used by
 * the summary output), creating the stats node lazily on first sight
 * of this syscall id for this thread.
 */
1569 static void thread__update_stats(struct thread_trace *ttrace,
1570 int id, struct perf_sample *sample)
1572 struct int_node *inode;
1573 struct stats *stats;
1576 inode = intlist__findnew(ttrace->syscall_stats, id);
1580 stats = inode->priv;
1581 if (stats == NULL) {
1582 stats = malloc(sizeof(struct stats));
1586 inode->priv = stats;
/* Only meaningful when the matching sys_enter was seen. */
1589 if (ttrace->entry_time && sample->time > ttrace->entry_time)
1590 duration = sample->time - ttrace->entry_time;
1592 update_stats(stats, duration);
/*
 * Another event arrived while the current thread still had an
 * unfinished syscall entry line pending: flush that line with a
 * ") ..." continuation marker so output stays readable.
 */
1595 static int trace__printf_interrupted_entry(struct trace *trace)
1597 struct thread_trace *ttrace;
1600 if (trace->failure_only || trace->current == NULL)
1603 ttrace = thread__priv(trace->current);
1605 if (!ttrace->entry_pending)
1608 printed = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
1609 printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1610 ttrace->entry_pending = false;
/*
 * When --print-sample style debugging is enabled, emit a raw event
 * header line: evsel name, timestamp (ms), comm, pid/tid and cpu.
 */
1615 static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
1616 struct perf_sample *sample, struct thread *thread)
1620 if (trace->print_sample) {
1621 double ts = (double)sample->time / NSEC_PER_MSEC;
1623 printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
1624 perf_evsel__name(evsel), ts,
1625 thread__comm_str(thread),
1626 sample->pid, sample->tid, sample->cpu);
/*
 * raw_syscalls:sys_enter handler: resolve the thread and syscall,
 * format "name(args" into the thread's entry_str and either print it
 * immediately or leave it pending so trace__sys_exit() can finish the
 * line with the return value (strace-like single-line output).
 */
1632 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1633 union perf_event *event __maybe_unused,
1634 struct perf_sample *sample)
1639 struct thread *thread;
1640 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1641 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1642 struct thread_trace *ttrace;
1647 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1648 ttrace = thread__trace(thread, trace->output);
1652 trace__fprintf_sample(trace, evsel, sample, thread);
1654 args = perf_evsel__sc_tp_ptr(evsel, args, sample);
/* Lazily allocate the per-thread entry line buffer. */
1656 if (ttrace->entry_str == NULL) {
1657 ttrace->entry_str = malloc(trace__entry_str_size);
1658 if (!ttrace->entry_str)
/* Flush any pending entry line from another thread first. */
1662 if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1663 trace__printf_interrupted_entry(trace);
1665 ttrace->entry_time = sample->time;
1666 msg = ttrace->entry_str;
1667 printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1669 printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1670 args, trace, thread);
/* Only print now if nothing requires waiting for the sys_exit side. */
1673 if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
1674 trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
1675 fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
1678 ttrace->entry_pending = true;
1679 /* See trace__vfs_getname & trace__sys_exit */
1680 ttrace->filename.pending_open = false;
/* Remember the last thread seen, for trace__printf_interrupted_entry(). */
1683 if (trace->current != thread) {
1684 thread__put(trace->current);
1685 trace->current = thread__get(thread);
1689 thread__put(thread);
/*
 * Print just the beautified argument list for a sys_enter_* style
 * sample (used by trace__event_handler for syscall tracepoints
 * requested via --event rather than the raw_syscalls path).
 */
1693 static int trace__fprintf_sys_enter(struct trace *trace, struct perf_evsel *evsel,
1694 struct perf_sample *sample)
1696 struct thread_trace *ttrace;
1697 struct thread *thread;
1698 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1699 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1706 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1707 ttrace = thread__trace(thread, trace->output);
1709 * We need to get ttrace just to make sure it is there when syscall__scnprintf_args()
1710 * and the rest of the beautifiers accessing it via struct syscall_arg touches it.
1715 args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1716 syscall__scnprintf_args(sc, msg, sizeof(msg), args, trace, thread);
1717 fprintf(trace->output, "%s", msg);
1720 thread__put(thread);
/*
 * Resolve a sample's callchain into 'cursor', honoring a per-event
 * sample_max_stack when set (falls back to the global value on the
 * line not visible here). NOTE(review): the success/error return
 * statements are not visible in this chunk.
 */
1724 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1725 struct perf_sample *sample,
1726 struct callchain_cursor *cursor)
1728 struct addr_location al;
1729 int max_stack = evsel->attr.sample_max_stack ?
1730 evsel->attr.sample_max_stack :
1733 if (machine__resolve(trace->host, &al, sample) < 0 ||
1734 thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack))
/*
 * Print the callchain previously resolved into the global cursor,
 * left-padded to 38 columns. NOTE(review): one print_opts flag line
 * between EVSEL__PRINT_SYM and EVSEL__PRINT_UNKNOWN_AS_ADDR is not
 * visible in this chunk.
 */
1740 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1742 /* TODO: user-configurable print_opts */
1743 const unsigned int print_opts = EVSEL__PRINT_SYM |
1745 EVSEL__PRINT_UNKNOWN_AS_ADDR;
1747 return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
/*
 * Map a negative syscall return to its errno constant name using the
 * architecture recorded in the evsel's perf environment (which may
 * differ from the running host's when processing a perf.data file).
 */
static const char *errno_to_name(struct perf_evsel *evsel, int err)
{
	return arch_syscalls__strerrno(perf_env__arch(perf_evsel__env(evsel)), err);
}
/*
 * raw_syscalls:sys_exit handler: completes the pending entry line (or
 * reprints it as "continued" when it was interrupted) with the return
 * value, rendered per the syscall's fmt: errno name for negatives,
 * "Timeout" for timeout syscalls returning 0, an armed ret_scnprintf
 * beautifier, hex for hexret syscalls, or the child comm for errpid
 * ones. Also updates summary stats, ties vfs_getname paths to the fd
 * returned by open/openat, applies the duration filter and prints
 * callchains.
 */
1758 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1759 union perf_event *event __maybe_unused,
1760 struct perf_sample *sample)
1764 bool duration_calculated = false;
1765 struct thread *thread;
1766 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1767 struct syscall *sc = trace__syscall_info(trace, evsel, id);
1768 struct thread_trace *ttrace;
1773 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1774 ttrace = thread__trace(thread, trace->output);
1778 trace__fprintf_sample(trace, evsel, sample, thread);
1781 thread__update_stats(ttrace, id, sample);
1783 ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
/* Associate the fd returned by open/openat with the vfs_getname path. */
1785 if (sc->is_open && ret >= 0 && ttrace->filename.pending_open) {
1786 trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1787 ttrace->filename.pending_open = false;
1788 ++trace->stats.vfs_getname;
1791 if (ttrace->entry_time) {
1792 duration = sample->time - ttrace->entry_time;
1793 if (trace__filter_duration(trace, duration))
1795 duration_calculated = true;
1796 } else if (trace->duration_filter)
1799 if (sample->callchain) {
1800 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1801 if (callchain_ret == 0) {
1802 if (callchain_cursor.nr < trace->min_stack)
1808 if (trace->summary_only || (ret >= 0 && trace->failure_only))
1811 trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
1813 if (ttrace->entry_pending) {
1814 fprintf(trace->output, "%-70s", ttrace->entry_str);
/* Entry line was already flushed; mark this as a continuation. */
1816 fprintf(trace->output, " ... [");
1817 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1818 fprintf(trace->output, "]: %s()", sc->name);
1821 if (sc->fmt == NULL) {
1825 fprintf(trace->output, ") = %ld", ret);
1826 } else if (ret < 0) {
1828 char bf[STRERR_BUFSIZE];
1829 const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
1830 *e = errno_to_name(evsel, -ret);
1832 fprintf(trace->output, ") = -1 %s %s", e, emsg);
1834 } else if (ret == 0 && sc->fmt->timeout)
1835 fprintf(trace->output, ") = 0 Timeout");
1836 else if (ttrace->ret_scnprintf) {
1838 struct syscall_arg arg = {
/* One-shot: armed by an argument beautifier at entry (e.g. fcntl). */
1843 ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
1844 ttrace->ret_scnprintf = NULL;
1845 fprintf(trace->output, ") = %s", bf);
1846 } else if (sc->fmt->hexret)
1847 fprintf(trace->output, ") = %#lx", ret);
1848 else if (sc->fmt->errpid) {
1849 struct thread *child = machine__find_thread(trace->host, ret, ret);
1851 if (child != NULL) {
1852 fprintf(trace->output, ") = %ld", ret);
1853 if (child->comm_set)
1854 fprintf(trace->output, " (%s)", thread__comm_str(child));
1860 fputc('\n', trace->output);
1862 if (callchain_ret > 0)
1863 trace__fprintf_callchain(trace, sample);
1864 else if (callchain_ret < 0)
1865 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1867 ttrace->entry_pending = false;
1870 thread__put(thread);
/*
 * probe:vfs_getname handler: capture the pathname being resolved,
 * stash it in the thread's filename state (for trace__sys_exit() to
 * tie to the returned fd) and, if a syscall entry line is pending with
 * a placeholder at entry_str_pos, splice the filename into that
 * pending string, truncating from the left when space runs out.
 */
1874 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1875 union perf_event *event __maybe_unused,
1876 struct perf_sample *sample)
1878 struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1879 struct thread_trace *ttrace;
1880 size_t filename_len, entry_str_len, to_move;
1881 ssize_t remaining_space;
1883 const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1888 ttrace = thread__priv(thread);
1892 filename_len = strlen(filename);
1893 if (filename_len == 0)
/* Grow the per-thread filename buffer on demand. */
1896 if (ttrace->filename.namelen < filename_len) {
1897 char *f = realloc(ttrace->filename.name, filename_len + 1);
1902 ttrace->filename.namelen = filename_len;
1903 ttrace->filename.name = f;
1906 strcpy(ttrace->filename.name, filename);
1907 ttrace->filename.pending_open = true;
/* No placeholder in the pending entry string: nothing to splice. */
1909 if (!ttrace->filename.ptr)
1912 entry_str_len = strlen(ttrace->entry_str);
1913 remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1914 if (remaining_space <= 0)
/* Keep the tail of overly long paths, which is the interesting part. */
1917 if (filename_len > (size_t)remaining_space) {
1918 filename += filename_len - remaining_space;
1919 filename_len = remaining_space;
1922 to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1923 pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1924 memmove(pos + filename_len, pos, to_move);
1925 memcpy(pos, filename, filename_len);
1927 ttrace->filename.ptr = 0;
1928 ttrace->filename.entry_str_pos = 0;
1930 thread__put(thread);
/*
 * sched:sched_stat_runtime handler: accumulate on-CPU runtime in ms,
 * per thread and globally, for the summary output; the trailing
 * fprintf is a fallback that dumps the raw event fields (reached via
 * an error path not visible in this chunk).
 */
1935 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1936 union perf_event *event __maybe_unused,
1937 struct perf_sample *sample)
1939 u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1940 double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1941 struct thread *thread = machine__findnew_thread(trace->host,
1944 struct thread_trace *ttrace = thread__trace(thread, trace->output);
1949 ttrace->runtime_ms += runtime_ms;
1950 trace->runtime_ms += runtime_ms;
1952 thread__put(thread);
1956 fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1958 perf_evsel__strval(evsel, sample, "comm"),
1959 (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1961 perf_evsel__intval(evsel, sample, "vruntime"));
/*
 * binary__fprintf() callback for bpf-output payloads: print each data
 * byte as a character (non-printables as '.'), ignoring all of the
 * layout ops. NOTE(review): the terminating break/return statements
 * of the switch are not visible in this chunk.
 */
1965 static int bpf_output__printer(enum binary_printer_ops op,
1966 unsigned int val, void *extra __maybe_unused, FILE *fp)
1968 unsigned char ch = (unsigned char)val;
1971 case BINARY_PRINT_CHAR_DATA:
1972 return fprintf(fp, "%c", isprint(ch) ? ch : '.');
1973 case BINARY_PRINT_DATA_BEGIN:
1974 case BINARY_PRINT_LINE_BEGIN:
1975 case BINARY_PRINT_ADDR:
1976 case BINARY_PRINT_NUM_DATA:
1977 case BINARY_PRINT_NUM_PAD:
1978 case BINARY_PRINT_SEP:
1979 case BINARY_PRINT_CHAR_PAD:
1980 case BINARY_PRINT_LINE_END:
1981 case BINARY_PRINT_DATA_END:
1989 static void bpf_output__fprintf(struct trace *trace,
1990 struct perf_sample *sample)
1992 binary__fprintf(sample->raw_data, sample->raw_size, 8,
1993 bpf_output__printer, NULL, trace->output);
/*
 * Generic handler for --event tracepoints and bpf-output events:
 * prints the timestamp and event name, then either the bpf payload,
 * the beautified sys_enter_* argument list, or the libtraceevent
 * field-by-field rendering, plus an optional callchain.
 */
1996 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1997 union perf_event *event __maybe_unused,
1998 struct perf_sample *sample)
2000 int callchain_ret = 0;
2002 if (sample->callchain) {
2003 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2004 if (callchain_ret == 0) {
2005 if (callchain_cursor.nr < trace->min_stack)
2011 trace__printf_interrupted_entry(trace);
2012 trace__fprintf_tstamp(trace, sample->time, trace->output);
2014 if (trace->trace_syscalls)
2015 fprintf(trace->output, "( ): ");
2017 fprintf(trace->output, "%s:", evsel->name);
2019 if (perf_evsel__is_bpf_output(evsel)) {
2020 bpf_output__fprintf(trace, sample);
2021 } else if (evsel->tp_format) {
/*
 * sys_enter_* events get the strace-like argument beautifiers;
 * anything else (or a beautifier failure) falls back to the
 * plain tracepoint field printer.
 */
2022 if (strncmp(evsel->tp_format->name, "sys_enter_", 10) ||
2023 trace__fprintf_sys_enter(trace, evsel, sample)) {
2024 event_format__fprintf(evsel->tp_format, sample->cpu,
2025 sample->raw_data, sample->raw_size,
2030 fprintf(trace->output, "\n");
2032 if (callchain_ret > 0)
2033 trace__fprintf_callchain(trace, sample);
2034 else if (callchain_ret < 0)
2035 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
/*
 * Print a resolved address location as "dso@" / "sym+0xoff" / raw
 * address, depending on verbosity and the print_dso/print_sym flags.
 * NOTE(review): the else lines choosing between the map-relative and
 * raw sample->addr fallbacks are not visible in this chunk.
 */
2040 static void print_location(FILE *f, struct perf_sample *sample,
2041 struct addr_location *al,
2042 bool print_dso, bool print_sym)
2045 if ((verbose > 0 || print_dso) && al->map)
2046 fprintf(f, "%s@", al->map->dso->long_name);
2048 if ((verbose > 0 || print_sym) && al->sym)
2049 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
2050 al->addr - al->sym->start);
2052 fprintf(f, "0x%" PRIx64, al->addr);
2054 fprintf(f, "0x%" PRIx64, sample->addr);
/*
 * Software page-fault event handler: prints
 * "maj/minfault [ip-location] => addr-location (type level)" with
 * symbolized locations, plus an optional callchain. map_type starts
 * as 'd' (data); the switch to 'x' for executable mappings happens on
 * lines not visible in this chunk.
 */
2057 static int trace__pgfault(struct trace *trace,
2058 struct perf_evsel *evsel,
2059 union perf_event *event __maybe_unused,
2060 struct perf_sample *sample)
2062 struct thread *thread;
2063 struct addr_location al;
2064 char map_type = 'd';
2065 struct thread_trace *ttrace;
2067 int callchain_ret = 0;
2069 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2071 if (sample->callchain) {
2072 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2073 if (callchain_ret == 0) {
2074 if (callchain_cursor.nr < trace->min_stack)
2080 ttrace = thread__trace(thread, trace->output);
/* Count major vs minor faults separately for the summary. */
2084 if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2089 if (trace->summary_only)
2092 thread__find_symbol(thread, sample->cpumode, sample->ip, &al);
2094 trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
2096 fprintf(trace->output, "%sfault [",
2097 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2100 print_location(trace->output, sample, &al, false, true);
2102 fprintf(trace->output, "] => ");
2104 thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
/* Retry resolution of the faulting address (second lookup). */
2107 thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
2115 print_location(trace->output, sample, &al, true, false);
2117 fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2119 if (callchain_ret > 0)
2120 trace__fprintf_callchain(trace, sample);
2121 else if (callchain_ret < 0)
2122 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2126 thread__put(thread);
/*
 * Record the first usable timestamp so later events can be printed
 * relative to it, unless trace->full_time (absolute timestamps) is
 * set.
 */
2130 static void trace__set_base_time(struct trace *trace,
2131 struct perf_evsel *evsel,
2132 struct perf_sample *sample)
2135 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2136 * and don't use sample->time unconditionally, we may end up having
2137 * some other event in the future without PERF_SAMPLE_TIME for good
2138 * reason, i.e. we may not be interested in its timestamps, just in
2139 * it taking place, picking some piece of information when it
2140 * appears in our event stream (vfs_getname comes to mind).
2142 if (trace->base_time == 0 && !trace->full_time &&
2143 (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2144 trace->base_time = sample->time;
/*
 * perf_tool sample callback (perf.data replay path): dispatch the
 * sample to the evsel's registered tracepoint handler, skipping
 * threads excluded by the thread filter.
 */
2147 static int trace__process_sample(struct perf_tool *tool,
2148 union perf_event *event,
2149 struct perf_sample *sample,
2150 struct perf_evsel *evsel,
2151 struct machine *machine __maybe_unused)
2153 struct trace *trace = container_of(tool, struct trace, tool);
2154 struct thread *thread;
2157 tracepoint_handler handler = evsel->handler;
2159 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2160 if (thread && thread__is_filtered(thread))
2163 trace__set_base_time(trace, evsel, sample);
2167 handler(trace, evsel, event, sample);
2170 thread__put(thread);
/*
 * 'perf trace record': build an argv for cmd_record() out of the fixed
 * record arguments, the syscall events (raw_syscalls when available,
 * legacy syscalls:* otherwise) and any requested page-fault events,
 * then append the user's own arguments.
 */
2174 static int trace__record(struct trace *trace, int argc, const char **argv)
2176 unsigned int rec_argc, i, j;
2177 const char **rec_argv;
2178 const char * const record_args[] = {
2185 const char * const sc_args[] = { "-e", };
2186 unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2187 const char * const majpf_args[] = { "-e", "major-faults" };
2188 unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2189 const char * const minpf_args[] = { "-e", "minor-faults" };
2190 unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2192 /* +1 is for the event string below */
2193 rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2194 majpf_args_nr + minpf_args_nr + argc;
2195 rec_argv = calloc(rec_argc + 1, sizeof(char *));
2197 if (rec_argv == NULL)
2201 for (i = 0; i < ARRAY_SIZE(record_args); i++)
2202 rec_argv[j++] = record_args[i];
2204 if (trace->trace_syscalls) {
2205 for (i = 0; i < sc_args_nr; i++)
2206 rec_argv[j++] = sc_args[i];
2208 /* event string may be different for older kernels - e.g., RHEL6 */
2209 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2210 rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2211 else if (is_valid_tracepoint("syscalls:sys_enter"))
2212 rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2214 pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2220 if (trace->trace_pgfaults & TRACE_PFMAJ)
2221 for (i = 0; i < majpf_args_nr; i++)
2222 rec_argv[j++] = majpf_args[i];
2224 if (trace->trace_pgfaults & TRACE_PFMIN)
2225 for (i = 0; i < minpf_args_nr; i++)
2226 rec_argv[j++] = minpf_args[i];
2228 for (i = 0; i < (unsigned int)argc; i++)
2229 rec_argv[j++] = argv[i];
2231 return cmd_record(j, rec_argv);
2234 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
/*
 * Try to add the probe:vfs_getname tracepoint to the evlist, returning
 * whether it is usable; drops the evsel when the probe exists but
 * lacks the "pathname" field we rely on.
 */
2236 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2238 struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2243 if (perf_evsel__field(evsel, "pathname") == NULL) {
2244 perf_evsel__delete(evsel);
2248 evsel->handler = trace__vfs_getname;
2249 perf_evlist__add(evlist, evsel);
/*
 * Create a software page-fault evsel (config selects MAJ or MIN),
 * sampling every fault, with trace__pgfault as its handler.
 * NOTE(review): attribute lines between the visible ones are not in
 * this chunk.
 */
2253 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2255 struct perf_evsel *evsel;
2256 struct perf_event_attr attr = {
2257 .type = PERF_TYPE_SOFTWARE,
2261 attr.config = config;
2262 attr.sample_period = 1;
2264 event_attr_init(&attr);
2266 evsel = perf_evsel__new(&attr);
2268 evsel->handler = trace__pgfault;
/*
 * Live-mode event dispatcher: non-sample records go to the
 * machine-state handlers; samples are routed to the owning evsel's
 * handler after checking that tracepoint samples carry a raw payload.
 */
2273 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2275 const u32 type = event->header.type;
2276 struct perf_evsel *evsel;
2278 if (type != PERF_RECORD_SAMPLE) {
2279 trace__process_event(trace, trace->host, event, sample);
2283 evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2284 if (evsel == NULL) {
2285 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2289 trace__set_base_time(trace, evsel, sample);
2291 if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2292 sample->raw_data == NULL) {
2293 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2294 perf_evsel__name(evsel), sample->tid,
2295 sample->cpu, sample->raw_size);
2297 tracepoint_handler handler = evsel->handler;
2298 handler(trace, evsel, event, sample);
/*
 * Create and add the raw_syscalls:sys_enter/sys_exit evsels, wire up
 * their sample handlers, pre-resolve the fields the hot path reads
 * (args pointer, ret value), configure callchain collection, and
 * restrict sys_exit callchains to user space unless
 * trace->kernel_syscallchains is set.
 */
2302 static int trace__add_syscall_newtp(struct trace *trace)
2305 struct perf_evlist *evlist = trace->evlist;
2306 struct perf_evsel *sys_enter, *sys_exit;
2308 sys_enter = perf_evsel__raw_syscall_newtp("sys_enter", trace__sys_enter);
2309 if (sys_enter == NULL)
2312 if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2313 goto out_delete_sys_enter;
2315 sys_exit = perf_evsel__raw_syscall_newtp("sys_exit", trace__sys_exit);
2316 if (sys_exit == NULL)
2317 goto out_delete_sys_enter;
2319 if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2320 goto out_delete_sys_exit;
2322 perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
2323 perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);
2325 perf_evlist__add(evlist, sys_enter);
2326 perf_evlist__add(evlist, sys_exit);
2328 if (callchain_param.enabled && !trace->kernel_syscallchains) {
2330 * We're interested only in the user space callchain
2331 * leading to the syscall, allow overriding that for
2332 * debugging reasons using --kernel_syscall_callchains
2334 sys_exit->attr.exclude_callchain_kernel = 1;
2337 trace->syscalls.events.sys_enter = sys_enter;
2338 trace->syscalls.events.sys_exit = sys_exit;
2344 out_delete_sys_exit:
2345 perf_evsel__delete_priv(sys_exit);
2346 out_delete_sys_enter:
2347 perf_evsel__delete_priv(sys_enter);
/*
 * Turn the validated ev_qualifier id list into an "id in (...)"
 * (or negated, per not_ev_qualifier) tracepoint filter expression and
 * append it to both the sys_enter and sys_exit evsels.
 */
2351 static int trace__set_ev_qualifier_filter(struct trace *trace)
2354 struct perf_evsel *sys_exit;
2355 char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2356 trace->ev_qualifier_ids.nr,
2357 trace->ev_qualifier_ids.entries);
2362 if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2364 sys_exit = trace->syscalls.events.sys_exit;
2365 err = perf_evsel__append_tp_filter(sys_exit, filter);
/*
 * Build the pid filter that avoids tracing ourselves into a feedback
 * loop: starting from pids[0], walk up the parent chain and also
 * filter an ancestor sshd (whose terminal writes our output would
 * otherwise keep generating events). NOTE(review): the pids[]
 * initialization and the walk's advance/termination lines are not
 * visible in this chunk.
 */
2376 static int trace__set_filter_loop_pids(struct trace *trace)
2378 unsigned int nr = 1;
2382 struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
2384 while (thread && nr < ARRAY_SIZE(pids)) {
2385 struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
2390 if (!strcmp(thread__comm_str(parent), "sshd")) {
2391 pids[nr++] = parent->tid;
2397 return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
2400 static int trace__run(struct trace *trace, int argc, const char **argv)
2402 struct perf_evlist *evlist = trace->evlist;
2403 struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2405 unsigned long before;
2406 const bool forks = argc > 0;
2407 bool draining = false;
2411 if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2412 goto out_error_raw_syscalls;
2414 if (trace->trace_syscalls)
2415 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2417 if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2418 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2419 if (pgfault_maj == NULL)
2421 perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2422 perf_evlist__add(evlist, pgfault_maj);
2425 if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2426 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2427 if (pgfault_min == NULL)
2429 perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2430 perf_evlist__add(evlist, pgfault_min);
2434 perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2435 trace__sched_stat_runtime))
2436 goto out_error_sched_stat_runtime;
2439 * If a global cgroup was set, apply it to all the events without an
2440 * explicit cgroup. I.e.:
2442 * trace -G A -e sched:*switch
2444 * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
2445 * _and_ sched:sched_switch to the 'A' cgroup, while:
2447 * trace -e sched:*switch -G A
2449 * will only set the sched:sched_switch event to the 'A' cgroup, all the
2450 * other events (raw_syscalls:sys_{enter,exit}, etc are left "without"
2451 * a cgroup (on the root cgroup, sys wide, etc).
2455 * trace -G A -e sched:*switch -G B
2457 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
2458 * to the 'B' cgroup.
2460 * evlist__set_default_cgroup() grabs a reference of the passed cgroup
2461 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
2464 evlist__set_default_cgroup(trace->evlist, trace->cgroup);
2466 err = perf_evlist__create_maps(evlist, &trace->opts.target);
2468 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2469 goto out_delete_evlist;
2472 err = trace__symbols_init(trace, evlist);
2474 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2475 goto out_delete_evlist;
2478 perf_evlist__config(evlist, &trace->opts, &callchain_param);
2480 signal(SIGCHLD, sig_handler);
2481 signal(SIGINT, sig_handler);
2484 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2487 fprintf(trace->output, "Couldn't run the workload!\n");
2488 goto out_delete_evlist;
2492 err = perf_evlist__open(evlist);
2494 goto out_error_open;
2496 err = bpf__apply_obj_config();
2498 char errbuf[BUFSIZ];
2500 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2501 pr_err("ERROR: Apply config to BPF failed: %s\n",
2503 goto out_error_open;
2507 * Better not use !target__has_task() here because we need to cover the
2508 * case where no threads were specified in the command line, but a
2509 * workload was, and in that case we will fill in the thread_map when
2510 * we fork the workload in perf_evlist__prepare_workload.
2512 if (trace->filter_pids.nr > 0)
2513 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2514 else if (thread_map__pid(evlist->threads, 0) == -1)
2515 err = trace__set_filter_loop_pids(trace);
2520 if (trace->ev_qualifier_ids.nr > 0) {
2521 err = trace__set_ev_qualifier_filter(trace);
2525 pr_debug("event qualifier tracepoint filter: %s\n",
2526 trace->syscalls.events.sys_exit->filter);
2529 err = perf_evlist__apply_filters(evlist, &evsel);
2531 goto out_error_apply_filters;
2533 err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
2535 goto out_error_mmap;
2537 if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
2538 perf_evlist__enable(evlist);
2541 perf_evlist__start_workload(evlist);
2543 if (trace->opts.initial_delay) {
2544 usleep(trace->opts.initial_delay * 1000);
2545 perf_evlist__enable(evlist);
2548 trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2549 evlist->threads->nr > 1 ||
2550 perf_evlist__first(evlist)->attr.inherit;
2553 * Now that we already used evsel->attr to ask the kernel to setup the
2554 * events, lets reuse evsel->attr.sample_max_stack as the limit in
2555 * trace__resolve_callchain(), allowing per-event max-stack settings
2556 * to override an explicitely set --max-stack global setting.
2558 evlist__for_each_entry(evlist, evsel) {
2559 if (evsel__has_callchain(evsel) &&
2560 evsel->attr.sample_max_stack == 0)
2561 evsel->attr.sample_max_stack = trace->max_stack;
2564 before = trace->nr_events;
2566 for (i = 0; i < evlist->nr_mmaps; i++) {
2567 union perf_event *event;
2568 struct perf_mmap *md;
2570 md = &evlist->mmap[i];
2571 if (perf_mmap__read_init(md) < 0)
2574 while ((event = perf_mmap__read_event(md)) != NULL) {
2575 struct perf_sample sample;
2579 err = perf_evlist__parse_sample(evlist, event, &sample);
2581 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2585 trace__handle_event(trace, event, &sample);
2587 perf_mmap__consume(md);
2592 if (done && !draining) {
2593 perf_evlist__disable(evlist);
2597 perf_mmap__read_done(md);
2600 if (trace->nr_events == before) {
2601 int timeout = done ? 100 : -1;
2603 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2604 if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2614 thread__zput(trace->current);
2616 perf_evlist__disable(evlist);
2620 trace__fprintf_thread_summary(trace, trace->output);
2622 if (trace->show_tool_stats) {
2623 fprintf(trace->output, "Stats:\n "
2624 " vfs_getname : %" PRIu64 "\n"
2625 " proc_getname: %" PRIu64 "\n",
2626 trace->stats.vfs_getname,
2627 trace->stats.proc_getname);
2632 trace__symbols__exit(trace);
2634 perf_evlist__delete(evlist);
2635 cgroup__put(trace->cgroup);
2636 trace->evlist = NULL;
2637 trace->live = false;
2640 char errbuf[BUFSIZ];
2642 out_error_sched_stat_runtime:
2643 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2646 out_error_raw_syscalls:
2647 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2651 perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2655 perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2658 fprintf(trace->output, "%s\n", errbuf);
2659 goto out_delete_evlist;
2661 out_error_apply_filters:
2662 fprintf(trace->output,
2663 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2664 evsel->filter, perf_evsel__name(evsel), errno,
2665 str_error_r(errno, errbuf, sizeof(errbuf)));
2666 goto out_delete_evlist;
2669 fprintf(trace->output, "Not enough memory to run!\n");
2670 goto out_delete_evlist;
2673 fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2674 goto out_delete_evlist;
/*
 * Replay mode ('perf trace -i perf.data'): open a previously recorded
 * session and run its events through the same handlers used live —
 * raw_syscalls sys_enter/sys_exit, page faults and vfs_getname.
 * NOTE(review): this excerpt elides several lines (braces, gotos,
 * return paths); comments describe only the visible statements.
 */
2677 static int trace__replay(struct trace *trace)
/* Map the probe:vfs_getname tracepoint to the pathname handler. */
2679 const struct perf_evsel_str_handler handlers[] = {
2680 { "probe:vfs_getname", trace__vfs_getname, },
2682 struct perf_data data = {
2686 .mode = PERF_DATA_MODE_READ,
/* --force: read the file even if ownership/permission checks complain. */
2687 .force = trace->force,
2689 struct perf_session *session;
2690 struct perf_evsel *evsel;
/* Wire up the perf_tool callbacks used while processing the session. */
2693 trace->tool.sample = trace__process_sample;
2694 trace->tool.mmap = perf_event__process_mmap;
2695 trace->tool.mmap2 = perf_event__process_mmap2;
2696 trace->tool.comm = perf_event__process_comm;
2697 trace->tool.exit = perf_event__process_exit;
2698 trace->tool.fork = perf_event__process_fork;
2699 trace->tool.attr = perf_event__process_attr;
2700 trace->tool.tracing_data = perf_event__process_tracing_data;
2701 trace->tool.build_id = perf_event__process_build_id;
2702 trace->tool.namespaces = perf_event__process_namespaces;
/* Replay needs timestamp-ordered delivery to make sense of the trace. */
2704 trace->tool.ordered_events = true;
2705 trace->tool.ordering_requires_timestamps = true;
2707 /* add tid to output */
2708 trace->multiple_threads = true;
2710 session = perf_session__new(&data, false, &trace->tool);
2711 if (session == NULL)
/* Restrict symbol resolution to the recorded pids/tids, if any. */
2714 if (trace->opts.target.pid)
2715 symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
2717 if (trace->opts.target.tid)
2718 symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
2720 if (symbol__init(&session->header.env) < 0)
2723 trace->host = &session->machines.host;
2725 err = perf_session__set_tracepoints_handlers(session, handlers);
/* Prefer raw_syscalls:sys_enter; fall back for older kernels below. */
2729 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2730 "raw_syscalls:sys_enter");
2731 /* older kernels have syscalls tp versus raw_syscalls */
2733 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2734 "syscalls:sys_enter");
2737 (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_enter) < 0 ||
2738 perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2739 pr_err("Error during initialize raw_syscalls:sys_enter event\n");
/* Same dance for the syscall-exit side: raw_syscalls first, then legacy. */
2743 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2744 "raw_syscalls:sys_exit");
2746 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2747 "syscalls:sys_exit");
2749 (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_exit) < 0 ||
2750 perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2751 pr_err("Error during initialize raw_syscalls:sys_exit event\n");
/* Route all recorded software page-fault events to the pgfault printer. */
2755 evlist__for_each_entry(session->evlist, evsel) {
2756 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2757 (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2758 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2759 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2760 evsel->handler = trace__pgfault;
2765 err = perf_session__process_events(session);
2767 pr_err("Failed to process events, error %d", err);
/* With -S/--with-summary, print the per-thread summary at the end. */
2769 else if (trace->summary)
2770 trace__fprintf_thread_summary(trace, trace->output);
2773 perf_session__delete(session);
/* Print the banner that precedes the per-thread event summary. */
2778 static size_t trace__fprintf_threads_header(FILE *fp)
2782 printed = fprintf(fp, "\n Summary of events:\n\n");
/*
 * Re-sort the per-thread syscall stats intlist by total time spent
 * (msecs, descending), for the thread__dump_stats() table.  Each resort
 * entry caches the syscall nr, its stats and the precomputed msecs key.
 */
2787 DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
2788 struct stats *stats;
/* Translate an intlist node into a resort entry. */
2793 struct int_node *source = rb_entry(nd, struct int_node, rb_node);
2794 struct stats *stats = source->priv;
2796 entry->syscall = source->i;
2797 entry->stats = stats;
/* total = n calls * average duration, converted from ns to ms. */
2798 entry->msecs = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
/*
 * Print one thread's per-syscall statistics table (calls, total, min,
 * avg, max, stddev-as-%-of-avg), sorted by total time via the
 * syscall_stats resort rb-tree.  Returns the number of chars printed.
 */
2801 static size_t thread__dump_stats(struct thread_trace *ttrace,
2802 struct trace *trace, FILE *fp)
2807 DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2809 if (syscall_stats == NULL)
/* Table header. */
2812 printed += fprintf(fp, "\n");
2814 printed += fprintf(fp, " syscall calls total min avg max stddev\n");
2815 printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
2816 printed += fprintf(fp, " --------------- -------- --------- --------- --------- --------- ------\n");
2818 resort_rb__for_each_entry(nd, syscall_stats) {
2819 struct stats *stats = syscall_stats_entry->stats;
/* Stats are kept in nanoseconds; columns are printed in msec. */
2821 double min = (double)(stats->min) / NSEC_PER_MSEC;
2822 double max = (double)(stats->max) / NSEC_PER_MSEC;
2823 double avg = avg_stats(stats);
2825 u64 n = (u64) stats->n;
/* stddev column is relative: stddev as a percentage of the mean. */
2827 pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2828 avg /= NSEC_PER_MSEC;
/* Look up the syscall name by number in the trace-wide table. */
2830 sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2831 printed += fprintf(fp, " %-15s", sc->name);
2832 printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2833 n, syscall_stats_entry->msecs, min, avg);
2834 printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2838 resort_rb__delete(syscall_stats);
2839 printed += fprintf(fp, "\n\n");
/*
 * Print one thread's summary line: comm (tid), event count, share of
 * all events, page-fault counts, accumulated runtime, then its
 * per-syscall stats table.  Returns chars printed.
 */
2844 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2847 struct thread_trace *ttrace = thread__priv(thread);
/* This thread's share of all events seen, as a percentage. */
2853 ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2855 printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2856 printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2857 printed += fprintf(fp, "%.1f%%", ratio);
2859 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2861 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
/* runtime_ms is only printed in one branch; otherwise just end the line. */
2863 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2864 else if (fputc('\n', fp) != EOF)
2867 printed += thread__dump_stats(ttrace, trace, fp);
/* NULL-safe accessor: a thread with no thread_trace has seen 0 events. */
2872 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2874 return ttrace ? ttrace->nr_events : 0;
/*
 * Re-sort a machine's threads rb-tree by number of traced events
 * (ascending per the '<' comparator), for the summary output.
 */
2877 DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
2878 struct thread *thread;
2881 entry->thread = rb_entry(nd, struct thread, rb_node);
/*
 * Print the end-of-run summary: header, then one entry per thread,
 * walking every hash bucket of the host machine's threads table and
 * sorting each bucket by event count.  Returns chars printed.
 */
2884 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2886 size_t printed = trace__fprintf_threads_header(fp);
2890 for (i = 0; i < THREADS__TABLE_SIZE; i++) {
/* Build the sorted view for this bucket; may fail on allocation. */
2891 DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);
2893 if (threads == NULL) {
2894 fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2898 resort_rb__for_each_entry(nd, threads)
2899 printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2901 resort_rb__delete(threads);
/*
 * --duration option callback: only show events lasting longer than
 * N.M milliseconds.
 * NOTE(review): atof() reports no parse errors — malformed input
 * silently becomes 0.0; strtod() with endptr checking would be safer.
 */
2906 static int trace__set_duration(const struct option *opt, const char *str,
2907 int unset __maybe_unused)
2909 struct trace *trace = opt->value;
2911 trace->duration_filter = atof(str);
/*
 * --filter-pids option callback: parse a CSV list of pids to be
 * filtered out by the kernel.  Entry 0 is always perf's own pid
 * (getpid()) so the tracer does not trace itself; the user's pids
 * follow from index 1.
 */
2915 static int trace__set_filter_pids(const struct option *opt, const char *str,
2916 int unset __maybe_unused)
2920 struct trace *trace = opt->value;
2922 * FIXME: introduce a intarray class, plain parse csv and create a
2923 * { int nr, int entries[] } struct...
2925 struct intlist *list = intlist__new(str);
/* +1 reserves the leading slot for our own pid. */
2930 i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2931 trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2933 if (trace->filter_pids.entries == NULL)
2936 trace->filter_pids.entries[0] = getpid();
/* Copy the user-supplied pids, shifted by one for the getpid() slot. */
2938 for (i = 1; i < trace->filter_pids.nr; ++i)
2939 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2941 intlist__delete(list);
/*
 * Open the -o/--output file for writing.  A pre-existing non-empty
 * file is first renamed to "<name>.old" as a backup.  Returns 0 on
 * success or -errno if fopen() fails.
 * NOTE(review): the rename() result is ignored — the backup is
 * best-effort and a failure there is silently dropped.
 */
2947 static int trace__open_output(struct trace *trace, const char *filename)
2951 if (!stat(filename, &st) && st.st_size) {
2952 char oldname[PATH_MAX];
2954 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2956 rename(filename, oldname);
2959 trace->output = fopen(filename, "w");
2961 return trace->output == NULL ? -errno : 0;
/*
 * -F/--pf option callback: accumulate which page-fault classes to
 * trace into the TRACE_PFMAJ/TRACE_PFMIN bitmask ("all" sets both).
 */
2964 static int parse_pagefaults(const struct option *opt, const char *str,
2965 int unset __maybe_unused)
2967 int *trace_pgfaults = opt->value;
2969 if (strcmp(str, "all") == 0)
2970 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2971 else if (strcmp(str, "maj") == 0)
2972 *trace_pgfaults |= TRACE_PFMAJ;
2973 else if (strcmp(str, "min") == 0)
2974 *trace_pgfaults |= TRACE_PFMIN;
/* Install the same sample handler on every evsel in the evlist. */
2981 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2983 struct perf_evsel *evsel;
2985 evlist__for_each_entry(evlist, evsel)
2986 evsel->handler = handler;
/*
 * For each "syscalls:sys_enter_*" / "syscalls:sys_exit_*" tracepoint
 * evsel without private state yet, set up the syscall_tp field
 * accessors: the args pointer for enter events, the return value
 * (u64, right after the id field) for exit events.
 */
2989 static int evlist__set_syscall_tp_fields(struct perf_evlist *evlist)
2991 struct perf_evsel *evsel;
2993 evlist__for_each_entry(evlist, evsel) {
/* Skip evsels already initialized or lacking tracepoint format info. */
2994 if (evsel->priv || !evsel->tp_format)
2997 if (strcmp(evsel->tp_format->system, "syscalls"))
3000 if (perf_evsel__init_syscall_tp(evsel))
3003 if (!strncmp(evsel->tp_format->name, "sys_enter_", 10)) {
3004 struct syscall_tp *sc = evsel->priv;
/* args live immediately after the u64 syscall id in the record. */
3006 if (__tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64)))
3008 } else if (!strncmp(evsel->tp_format->name, "sys_exit_", 9)) {
3009 struct syscall_tp *sc = evsel->priv;
3011 if (__tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap))
3020 * XXX: Hackish, just splitting the combined -e+--event (syscalls
3021 * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
3022 * existing facilities unchanged (trace->ev_qualifier + parse_options()).
3024 * It'd be better to introduce a parse_options() variant that would return a
3025 * list with the terms it didn't match to an event...
/*
 * -e/--event option callback: split the comma-separated argument into
 * two lists — syscall names/globs and strace group files go into the
 * ev_qualifier (lists[1]); everything else is handed to the stock
 * parse_events_option() as ordinary perf events (lists[0]).
 */
3027 static int trace__parse_events_option(const struct option *opt, const char *str,
3028 int unset __maybe_unused)
3030 struct trace *trace = (struct trace *)opt->value;
3031 const char *s = str;
3032 char *sep = NULL, *lists[2] = { NULL, NULL, };
/* len bounds the worst case: the whole argument plus its terminator. */
3033 int len = strlen(str) + 1, err = -1, list, idx;
3034 char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
3035 char group_name[PATH_MAX];
3037 if (strace_groups_dir == NULL)
/* A leading '!' negates the qualifier (trace everything BUT these). */
3042 trace->not_ev_qualifier = true;
/* Walk the argument token by token, splitting on commas. */
3046 if ((sep = strchr(s, ',')) != NULL)
/* A known syscall name/glob, or a readable strace group file, is a qualifier. */
3050 if (syscalltbl__id(trace->sctbl, s) >= 0 ||
3051 syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
3054 path__join(group_name, sizeof(group_name), strace_groups_dir, s);
3055 if (access(group_name, R_OK) == 0)
/* Append to an already-started list, comma separated... */
3060 sprintf(lists[list] + strlen(lists[list]), ",%s", s);
/* ...or start the list with the first token. */
3062 lists[list] = malloc(len);
3063 if (lists[list] == NULL)
3065 strcpy(lists[list], s);
/* lists[1]: syscall qualifier — build the strlist and validate it. */
3075 if (lists[1] != NULL) {
3076 struct strlist_config slist_config = {
3077 .dirname = strace_groups_dir,
3080 trace->ev_qualifier = strlist__new(lists[1], &slist_config);
3081 if (trace->ev_qualifier == NULL) {
3082 fputs("Not enough memory to parse event qualifier", trace->output);
3086 if (trace__validate_ev_qualifier(trace))
3088 trace->trace_syscalls = true;
/* lists[0]: plain perf events — reuse the standard -e parser. */
3094 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
3095 "event selector. use 'perf list' to list available events",
3096 parse_events_option);
3097 err = parse_events_option(&o, lists[0], 0);
/*
 * -G/--cgroup option callback.  If events were already parsed (-G came
 * after -e), defer to parse_cgroups() so the cgroup applies to those
 * events; otherwise remember it as the trace-wide default cgroup to be
 * applied later to events still without one.
 */
3106 static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
3108 struct trace *trace = opt->value;
3110 if (!list_empty(&trace->evlist->entries))
3111 return parse_cgroups(opt, str, unset);
3113 trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
3118 int cmd_trace(int argc, const char **argv)
3120 const char *trace_usage[] = {
3121 "perf trace [<options>] [<command>]",
3122 "perf trace [<options>] -- <command> [<options>]",
3123 "perf trace record [<options>] [<command>]",
3124 "perf trace record [<options>] -- <command> [<options>]",
3127 struct trace trace = {
3136 .user_freq = UINT_MAX,
3137 .user_interval = ULLONG_MAX,
3138 .no_buffering = true,
3139 .mmap_pages = UINT_MAX,
3140 .proc_map_timeout = 500,
3144 .trace_syscalls = false,
3145 .kernel_syscallchains = false,
3146 .max_stack = UINT_MAX,
3148 const char *output_name = NULL;
3149 const struct option trace_options[] = {
3150 OPT_CALLBACK('e', "event", &trace, "event",
3151 "event/syscall selector. use 'perf list' to list available events",
3152 trace__parse_events_option),
3153 OPT_BOOLEAN(0, "comm", &trace.show_comm,
3154 "show the thread COMM next to its id"),
3155 OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3156 OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
3157 trace__parse_events_option),
3158 OPT_STRING('o', "output", &output_name, "file", "output file name"),
3159 OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3160 OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3161 "trace events on existing process id"),
3162 OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3163 "trace events on existing thread id"),
3164 OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3165 "pids to filter (by the kernel)", trace__set_filter_pids),
3166 OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3167 "system-wide collection from all CPUs"),
3168 OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3169 "list of cpus to monitor"),
3170 OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3171 "child tasks do not inherit counters"),
3172 OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3173 "number of mmap data pages",
3174 perf_evlist__parse_mmap_pages),
3175 OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3177 OPT_CALLBACK(0, "duration", &trace, "float",
3178 "show only events with duration > N.M ms",
3179 trace__set_duration),
3180 OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3181 OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3182 OPT_BOOLEAN('T', "time", &trace.full_time,
3183 "Show full timestamp, not time relative to first start"),
3184 OPT_BOOLEAN(0, "failure", &trace.failure_only,
3185 "Show only syscalls that failed"),
3186 OPT_BOOLEAN('s', "summary", &trace.summary_only,
3187 "Show only syscall summary with statistics"),
3188 OPT_BOOLEAN('S', "with-summary", &trace.summary,
3189 "Show all syscalls and summary with statistics"),
3190 OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3191 "Trace pagefaults", parse_pagefaults, "maj"),
3192 OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3193 OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3194 OPT_CALLBACK(0, "call-graph", &trace.opts,
3195 "record_mode[,record_size]", record_callchain_help,
3196 &record_parse_callchain_opt),
3197 OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
3198 "Show the kernel callchains on the syscall exit path"),
3199 OPT_UINTEGER(0, "min-stack", &trace.min_stack,
3200 "Set the minimum stack depth when parsing the callchain, "
3201 "anything below the specified depth will be ignored."),
3202 OPT_UINTEGER(0, "max-stack", &trace.max_stack,
3203 "Set the maximum stack depth when parsing the callchain, "
3204 "anything beyond the specified depth will be ignored. "
3205 "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
3206 OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
3207 "print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
3208 OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
3209 "per thread proc mmap processing timeout in ms"),
3210 OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
3211 trace__parse_cgroups),
3212 OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
3213 "ms to wait before starting measurement after program "
3217 bool __maybe_unused max_stack_user_set = true;
3218 bool mmap_pages_user_set = true;
3219 struct perf_evsel *evsel;
3220 const char * const trace_subcommands[] = { "record", NULL };
3224 signal(SIGSEGV, sighandler_dump_stack);
3225 signal(SIGFPE, sighandler_dump_stack);
3227 trace.evlist = perf_evlist__new();
3228 trace.sctbl = syscalltbl__new();
3230 if (trace.evlist == NULL || trace.sctbl == NULL) {
3231 pr_err("Not enough memory to run!\n");
3236 argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3237 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3239 if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
3240 usage_with_options_msg(trace_usage, trace_options,
3241 "cgroup monitoring only available in system-wide mode");
3244 evsel = bpf__setup_output_event(trace.evlist, "__augmented_syscalls__");
3245 if (IS_ERR(evsel)) {
3246 bpf__strerror_setup_output_event(trace.evlist, PTR_ERR(evsel), bf, sizeof(bf));
3247 pr_err("ERROR: Setup trace syscalls enter failed: %s\n", bf);
3251 err = bpf__setup_stdout(trace.evlist);
3253 bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
3254 pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
3260 if (trace.trace_pgfaults) {
3261 trace.opts.sample_address = true;
3262 trace.opts.sample_time = true;
3265 if (trace.opts.mmap_pages == UINT_MAX)
3266 mmap_pages_user_set = false;
3268 if (trace.max_stack == UINT_MAX) {
3269 trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack();
3270 max_stack_user_set = false;
3273 #ifdef HAVE_DWARF_UNWIND_SUPPORT
3274 if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
3275 record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
3279 if (callchain_param.enabled) {
3280 if (!mmap_pages_user_set && geteuid() == 0)
3281 trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
3283 symbol_conf.use_callchain = true;
3286 if (trace.evlist->nr_entries > 0) {
3287 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3288 if (evlist__set_syscall_tp_fields(trace.evlist)) {
3289 perror("failed to set syscalls:* tracepoint fields");
3294 if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3295 return trace__record(&trace, argc-1, &argv[1]);
3297 /* summary_only implies summary option, but don't overwrite summary if set */
3298 if (trace.summary_only)
3299 trace.summary = trace.summary_only;
3301 if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3302 trace.evlist->nr_entries == 0 /* Was --events used? */) {
3303 trace.trace_syscalls = true;
3306 if (output_name != NULL) {
3307 err = trace__open_output(&trace, output_name);
3309 perror("failed to create output file");
3314 err = target__validate(&trace.opts.target);
3316 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3317 fprintf(trace.output, "%s", bf);
3321 err = target__parse_uid(&trace.opts.target);
3323 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3324 fprintf(trace.output, "%s", bf);
3328 if (!argc && target__none(&trace.opts.target))
3329 trace.opts.target.system_wide = true;
3332 err = trace__replay(&trace);
3334 err = trace__run(&trace, argc, argv);
3337 if (output_name != NULL)
3338 fclose(trace.output);