4 * Builtin 'trace' command:
6 * Display a continuously updated trace of any workload, CPU, specific PID,
7 * system wide, etc. Default format is loosely strace like, but any other
8 * event may be specified using --event.
10 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
12 * Initially based on the 'trace' prototype by Thomas Gleixner:
14 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
16 * Released under the GPL v2. (and only v2, not any later version)
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/event.h"
25 #include "util/evlist.h"
26 #include <subcmd/exec-cmd.h>
27 #include "util/machine.h"
28 #include "util/path.h"
29 #include "util/session.h"
30 #include "util/thread.h"
31 #include <subcmd/parse-options.h>
32 #include "util/strlist.h"
33 #include "util/intlist.h"
34 #include "util/thread_map.h"
35 #include "util/stat.h"
36 #include "trace/beauty/beauty.h"
37 #include "trace-event.h"
38 #include "util/parse-events.h"
39 #include "util/bpf-loader.h"
40 #include "callchain.h"
41 #include "print_binary.h"
43 #include "syscalltbl.h"
44 #include "rb_resort.h"
48 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
53 #include <linux/err.h>
54 #include <linux/filter.h>
55 #include <linux/audit.h>
56 #include <linux/kernel.h>
57 #include <linux/random.h>
58 #include <linux/stringify.h>
59 #include <linux/time64.h>
61 #include "sane_ctype.h"
/* Fallbacks for older libc headers that lack these constants.
 * NOTE(review): the matching #ifndef/#endif lines for O_CLOEXEC appear to be
 * missing from this excerpt — confirm against the full file. */
# define O_CLOEXEC		02000000
#ifndef F_LINUX_SPECIFIC_BASE
# define F_LINUX_SPECIFIC_BASE	1024
	/* NOTE(review): this span is the interior of 'struct trace' (its opening
	 * 'struct trace {' line and several members are missing from this
	 * excerpt) followed by the two accessor callbacks of 'struct tp_field'. */
	struct perf_tool	tool;		/* session-processing callbacks */
	struct syscalltbl	*sctbl;		/* syscall name <-> id table */
	struct syscall		*table;		/* per-id syscall info, grown on demand */
	struct perf_evsel	*sys_enter,	/* raw_syscalls:sys_enter evsel */
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct thread		*current;	/* thread of the last processed event */
	unsigned long		nr_events;	/* total samples seen */
	struct strlist		*ev_qualifier;	/* -e syscall name filter */
	double			duration_filter;	/* --duration, in ms */
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;	/* -e was negated (!name) */
	bool			multiple_threads;
	bool			show_tool_stats;
	bool			kernel_syscallchains;
	/* struct tp_field accessors: fetch a tracepoint field from raw_data */
	u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
	void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
/* Generate tp_field__u{8,16,32,64}() accessors that memcpy the field out of
 * raw_data (which may be unaligned) and return it widened to u64.
 * NOTE(review): macro bodies and the switch cases below are partially missing
 * from this excerpt (function braces, 'u##bits value;' declarations, case
 * labels and the default/return) — confirm against the full file. */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \

/* Same, but byte-swap the value for cross-endian perf.data files. */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);

/* Bind the right-sized (and possibly swapped) integer accessor to 'field'. */
static int tp_field__init_uint(struct tp_field *field,
			       struct format_field *format_field,
	field->offset = format_field->offset;

	switch (format_field->size) {
		field->integer = tp_field__u8;	/* 8-bit: no swap needed */
		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
182 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
184 return sample->raw_data + field->offset;
187 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
189 field->offset = format_field->offset;
190 field->pointer = tp_field__ptr;
	/* NOTE(review): interior of 'struct syscall_tp' — the 'id' field and the
	 * struct opener/closer are missing from this excerpt. */
	struct tp_field args, ret;	/* sys_enter args / sys_exit return value */

/* Look up tracepoint field 'name' on 'evsel' and bind an integer accessor.
 * NOTE(review): the 'const char *name' parameter line and the error return
 * for a missing field are not visible in this excerpt. */
static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
					  struct tp_field *field,
	struct format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)

	return tp_field__init_uint(field, format_field, evsel->needs_swap);

/* Convenience: init the syscall_tp member named 'name' from the field of the
 * same name ('#name' stringizes it). */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })

/* Same as above, but binds the pointer accessor. */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
	struct format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)

	return tp_field__init_ptr(field, format_field);

#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
233 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
236 perf_evsel__delete(evsel);
/* Allocate the per-evsel syscall_tp, bind its 'id' field accessor and install
 * the sample handler.  NOTE(review): the error/return paths of this function
 * are missing from this excerpt. */
static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
	evsel->priv = malloc(sizeof(struct syscall_tp));
	if (evsel->priv != NULL) {
		if (perf_evsel__init_sc_tp_uint_field(evsel, id))

		evsel->handler = handler;

/* Create the raw_syscalls:sys_{enter,exit} tracepoint evsel, falling back to
 * the legacy "syscalls" subsystem name on old kernels.
 * NOTE(review): NULL checks and the out/error labels are not visible here. */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
		evsel = perf_evsel__newtp("syscalls", direction);

	if (perf_evsel__init_syscall_tp(evsel, handler))

	perf_evsel__delete_priv(evsel);

/* Fetch the integer/pointer tracepoint field 'name' from a sample via the
 * accessors bound at init time. */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
286 size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
288 int idx = val - sa->offset;
290 if (idx < 0 || idx >= sa->nr_entries)
291 return scnprintf(bf, size, intfmt, val);
293 return scnprintf(bf, size, "%s", sa->entries[idx]);
/* Adapter: format a syscall arg through the strarray stored in arg->parm.
 * NOTE(review): the 'const char *intfmt' parameter line is missing from this
 * excerpt. */
static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						struct syscall_arg *arg)
	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);

/* Default strarray formatter: out-of-range values print as decimal. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
313 struct strarray **entries;
316 #define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
317 .nr_entries = ARRAY_SIZE(array), \
321 size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
322 struct syscall_arg *arg)
324 struct strarrays *sas = arg->parm;
327 for (i = 0; i < sas->nr_entries; ++i) {
328 struct strarray *sa = sas->entries[i];
329 int idx = arg->val - sa->offset;
331 if (idx >= 0 && idx < sa->nr_entries) {
332 if (sa->entries[idx] == NULL)
334 return scnprintf(bf, size, "%s", sa->entries[idx]);
338 return scnprintf(bf, size, "%d", arg->val);
/* NOTE(review): the surrounding '#ifndef AT_FDCWD'/'#endif' guard lines are
 * missing from this excerpt. */
#define AT_FDCWD	-100

/* Print *at() dirfd arguments: the AT_FDCWD sentinel prints as "CWD",
 * anything else as a regular fd (with its resolved path when known).
 * NOTE(review): the 'int fd = arg->val;' / 'if (fd == AT_FDCWD)' lines are
 * not visible in this excerpt. */
static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
					   struct syscall_arg *arg)
		return scnprintf(bf, size, "CWD");

	return syscall_arg__scnprintf_fd(bf, size, arg);

#define SCA_FDAT syscall_arg__scnprintf_fd_at

/* Forward declaration: defined after thread_trace, since it drops the
 * cached fd path on close(). */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
363 size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
365 return scnprintf(bf, size, "%#lx", arg->val);
368 size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
370 return scnprintf(bf, size, "%d", arg->val);
373 size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
375 return scnprintf(bf, size, "%ld", arg->val);
/* Symbolic-name tables for enum-like syscall arguments, wrapped into
 * 'struct strarray's for SCA_STRARRAY.  Names omit the common prefix
 * (e.g. "MAP_CREATE" for BPF_MAP_CREATE).
 * NOTE(review): the closing "};" of several initializers is missing from this
 * excerpt — confirm against the full file. */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
static DEFINE_STRARRAY(bpf_cmd);

/* EPOLL_CTL_* start at 1, hence the offset. */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
static DEFINE_STRARRAY(keyctl_options);

/* NOTE(review): further SEEK_* entries appear to be missing here. */
static const char *whences[] = { "SET", "CUR", "END",
static DEFINE_STRARRAY(whences);

static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
static DEFINE_STRARRAY(fcntl_cmds);

/* F_* commands above F_LINUX_SPECIFIC_BASE (1024); the [5] designator leaves
 * NULL holes for unassigned command numbers. */
static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", [5] = "CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);

/* Both fcntl tables, tried in order by SCA_FCNTL_CMD. */
static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
static DEFINE_STRARRAYS(fcntl_cmds_arrays);

static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
static DEFINE_STRARRAY(rlimit_resources);

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
static DEFINE_STRARRAY(clockid);

static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
static DEFINE_STRARRAY(socket_families);
/* Decode access(2) mode: F_OK alone prints as "F", otherwise R/W/X letters
 * are concatenated and any residual bits are appended in hex.
 * NOTE(review): the P_MODE() macro definition, its R/W/X invocations and the
 * function braces are partially missing from this excerpt. */
static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
						 struct syscall_arg *arg)
	if (mode == F_OK) /* 0 */
		return scnprintf(bf, size, "F");
	if (mode & n##_OK) { \
		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
		/* leftover unknown bits, printed in hex */
		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);

#define SCA_ACCMODE syscall_arg__scnprintf_access_mode

/* Defined later: needs thread_trace for the vfs_getname bookkeeping. */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename

/* Decode pipe2(2) flags (O_CLOEXEC|O_NONBLOCK...); residual bits in hex.
 * NOTE(review): the P_FLAG() macro definition and its invocations are missing
 * from this excerpt. */
static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
	int printed = 0, flags = arg->val;

	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \

		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags

/* getrandom(2) flag fallbacks for pre-3.17 headers.
 * NOTE(review): the '#endif' / '#ifndef GRND_RANDOM' guard lines are missing
 * from this excerpt. */
#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK	0x0001
#define GRND_RANDOM	0x0002

/* Decode getrandom(2) flags; residual bits in hex. */
static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
						     struct syscall_arg *arg)
	int printed = 0, flags = arg->val;

	if (flags & GRND_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~GRND_##n; \

		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags

/* Shorthand initializer for a syscall_arg_fmt driven by a strarray. */
#define STRARRAY(name, array) \
	{ .scnprintf = SCA_STRARRAY, \
	  .parm	     = &strarray__##array, }
548 #include "trace/beauty/eventfd.c"
549 #include "trace/beauty/flock.c"
550 #include "trace/beauty/futex_op.c"
551 #include "trace/beauty/mmap.c"
552 #include "trace/beauty/mode_t.c"
553 #include "trace/beauty/msg_flags.c"
554 #include "trace/beauty/open_flags.c"
555 #include "trace/beauty/perf_event_open.c"
556 #include "trace/beauty/pid.c"
557 #include "trace/beauty/sched_policy.c"
558 #include "trace/beauty/seccomp.c"
559 #include "trace/beauty/signum.c"
560 #include "trace/beauty/socket_type.c"
561 #include "trace/beauty/waitid_options.c"
/* Per-argument formatting hook: how to pretty-print one syscall argument.
 * NOTE(review): further members (parm, name, show_zero) and the closing brace
 * are missing from this excerpt. */
struct syscall_arg_fmt {
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
/* Per-syscall formatting table, sorted by name for bsearch() in
 * syscall_fmt__find().  Each entry may alias a tracepoint name, mark the
 * return value as hex/pid/timeout and attach per-argument formatters.
 * NOTE(review): this excerpt is heavily truncated — the struct members
 * (name/alias/hexret/errpid/timeout/nr_args), the 'syscall_fmts[] = {' opener
 * and many '{ .name = ...' lines are missing.  Tokens below are kept as-is. */
static struct syscall_fmt {
	struct syscall_arg_fmt arg[6];	/* one slot per syscall argument */
	  .arg = { [1] = { .scnprintf = SCA_ACCMODE, /* mode */ }, }, },
	{ .name	    = "arch_prctl", .alias = "prctl", },
	  .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
	{ .name	    = "brk",	    .hexret = true,
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
	{ .name     = "clock_gettime",
	  .arg = { [0] = STRARRAY(clk_id, clockid), }, },
	{ .name	    = "clone",	    .errpid = true, .nr_args = 5,
	  .arg = { [0] = { .name = "flags",	    .scnprintf = SCA_CLONE_FLAGS, },
		   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
		   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
		   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
		   [4] = { .name = "tls",	    .scnprintf = SCA_HEX, }, }, },
	  .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
	{ .name	    = "epoll_ctl",
	  .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
	{ .name	    = "eventfd2",
	  .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
	{ .name	    = "fchmodat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "fchownat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	  .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
			   .parm      = &strarrays__fcntl_cmds_arrays,
			   .show_zero = true, },
		   [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
	  .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
	{ .name	    = "fstat", .alias = "newfstat", },
	{ .name	    = "fstatat", .alias = "newfstatat", },
	  .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ }, }, },
	{ .name	    = "futimesat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "getitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
	{ .name	    = "getpid",	    .errpid = true, },
	{ .name	    = "getpgid",    .errpid = true, },
	{ .name	    = "getppid",    .errpid = true, },
	{ .name	    = "getrandom",
	  .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
	{ .name	    = "getrlimit",
	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
		   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
	  .arg = { [0] = STRARRAY(option, keyctl_options), }, },
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	  .arg = { [2] = STRARRAY(whence, whences), }, },
	{ .name	    = "lstat", .alias = "newlstat", },
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ },
		   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "mlockall",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "mmap",	    .hexret = true,
/* The standard mmap maps to old_mmap on s390x */
#if defined(__s390x__)
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* addr */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
		   [3] = { .scnprintf = SCA_MMAP_FLAGS,	/* flags */ }, }, },
	{ .name	    = "mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ }, }, },
	{ .name	    = "mq_unlink",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
	{ .name	    = "mremap",	    .hexret = true,
	  .arg = { [0] = { .scnprintf = SCA_HEX,	  /* addr */ },
		   [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
		   [4] = { .scnprintf = SCA_HEX,	  /* new_addr */ }, }, },
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "name_to_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "newfstatat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	  .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "open_by_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "perf_event_open",
	  .arg = { [2] = { .scnprintf = SCA_INT,	/* cpu */ },
		   [3] = { .scnprintf = SCA_FD,		/* group_fd */ },
		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
	  .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
	{ .name	    = "pkey_alloc",
	  .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,	/* access_rights */ }, }, },
	{ .name	    = "pkey_free",
	  .arg = { [0] = { .scnprintf = SCA_INT,	/* key */ }, }, },
	{ .name	    = "pkey_mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
		   [3] = { .scnprintf = SCA_INT,	/* pkey */ }, }, },
	{ .name	    = "poll", .timeout = true, },
	{ .name	    = "ppoll", .timeout = true, },
	{ .name	    = "pread", .alias = "pread64", },
	{ .name	    = "preadv", .alias = "pread", },
	{ .name	    = "prlimit64",
	  .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "pwrite", .alias = "pwrite64", },
	{ .name	    = "readlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "recvfrom",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "recvmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "renameat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "rt_sigaction",
	  .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "rt_sigprocmask",
	  .arg = { [0] = STRARRAY(how, sighow), }, },
	{ .name	    = "rt_sigqueueinfo",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "rt_tgsigqueueinfo",
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "sched_setscheduler",
	  .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
	  .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,	   /* op */ },
		   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
	{ .name	    = "select", .timeout = true, },
	{ .name	    = "sendmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "set_tid_address", .errpid = true, },
	{ .name	    = "setitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
	{ .name	    = "setrlimit",
	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
	{ .name	    = "socketpair",
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
	{ .name	    = "stat", .alias = "newstat", },
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	 /* fdat */ },
		   [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
		   [3] = { .scnprintf = SCA_STATX_MASK,	 /* mask */ }, }, },
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
	{ .name	    = "symlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "uname", .alias = "newuname", },
	{ .name	    = "unlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "utimensat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
	{ .name	    = "wait4",	    .errpid = true,
	  .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
	{ .name	    = "waitid",	    .errpid = true,
	  .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
778 static int syscall_fmt__cmp(const void *name, const void *fmtp)
780 const struct syscall_fmt *fmt = fmtp;
781 return strcmp(name, fmt->name);
784 static struct syscall_fmt *syscall_fmt__find(const char *name)
786 const int nmemb = ARRAY_SIZE(syscall_fmts);
787 return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
	/* NOTE(review): interior of 'struct syscall' — the opener and several
	 * members (name, is_exit, nr_args, ...) are missing from this excerpt. */
	struct event_format *tp_format;	/* libtraceevent format for sys_enter_<name> */
	struct format_field *args;	/* first argument field (after the nr field) */
	struct syscall_fmt  *fmt;	/* entry from syscall_fmts[], or NULL */
	struct syscall_arg_fmt *arg_fmt;	/* per-argument formatters, calloc'ed */
 * We need to have this 'calculated' boolean because in some cases we really
 * don't know the duration of a syscall, for instance, when we start a
 * session and some threads are waiting for a syscall to finish, say 'poll',
 * in which case all we can do is to print "( ? )" for the duration and,
 * likewise, for the timestamp.
807 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
809 double duration = (double)t / NSEC_PER_MSEC;
810 size_t printed = fprintf(fp, "(");
813 printed += fprintf(fp, " ? ");
814 else if (duration >= 1.0)
815 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
816 else if (duration >= 0.01)
817 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
819 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
820 return printed + fprintf(fp, "): ");
824 * filename.ptr: The filename char pointer that will be vfs_getname'd
825 * filename.entry_str_pos: Where to insert the string translated from
826 * filename.ptr by the vfs_getname tracepoint/kprobe.
827 * ret_scnprintf: syscall args may set this to a different syscall return
828 * formatter, for instance, fcntl may return fds, file flags, etc.
/* Per-thread state hung off thread->priv: event counts, fault counts, the
 * fd -> path cache and per-syscall statistics.
 * NOTE(review): several members (entry_time, entry_str, paths.{max,table},
 * the filename sub-struct and the closing brace) are missing from this
 * excerpt. */
struct thread_trace {
	unsigned long	  nr_events;	/* samples seen on this thread */
	unsigned long	  pfmaj, pfmin;	/* major/minor page fault counts */
	/* overrides the return formatter for the current syscall, see
	 * syscall_arg__set_ret_scnprintf() */
	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	short int	  entry_str_pos;	/* where to splice the vfs_getname string */
	unsigned int	  namelen;
	struct intlist	  *syscall_stats;	/* per-syscall-id stats_node list */
853 static struct thread_trace *thread_trace__new(void)
855 struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));
858 ttrace->paths.max = -1;
860 ttrace->syscall_stats = intlist__new(NULL);
/* Get-or-create the thread's thread_trace, bumping its event count; on any
 * failure print a warning and return NULL so the caller drops the sample.
 * NOTE(review): the NULL-thread guard, the nr_events increment and the
 * fail/return paths are missing from this excerpt. */
static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
	struct thread_trace *ttrace;

	if (thread__priv(thread) == NULL)
		thread__set_priv(thread, thread_trace__new());

	if (thread__priv(thread) == NULL)

	ttrace = thread__priv(thread);

	color_fprintf(fp, PERF_COLOR_RED,
		      "WARNING: not enough memory, dropping samples!\n");
889 void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
890 size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
892 struct thread_trace *ttrace = thread__priv(arg->thread);
894 ttrace->ret_scnprintf = ret_scnprintf;
/* Bits for --pf {maj,min,all}: which page-fault kinds to trace. */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

/* Size of the per-thread buffer holding the formatted sys_enter line. */
static const size_t trace__entry_str_size = 2048;
/* Cache 'pathname' for 'fd' in the thread's fd->path table, growing the
 * table (and zeroing the new slots) when fd exceeds the current max.
 * Returns 0 on success, -1 on allocation failure.
 * NOTE(review): the realloc-failure return and some braces are missing from
 * this excerpt; the realloc result goes through 'npath' so the old table is
 * not leaked on failure. */
static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
	struct thread_trace *ttrace = thread__priv(thread);

	if (fd > ttrace->paths.max) {
		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));

		if (ttrace->paths.max != -1) {
			/* zero only the newly added slots */
			memset(npath + ttrace->paths.max + 1, 0,
			       (fd - ttrace->paths.max) * sizeof(char *));
			/* first allocation: zero everything */
			memset(npath, 0, (fd + 1) * sizeof(char *));

		ttrace->paths.table = npath;
		ttrace->paths.max   = fd;

	ttrace->paths.table[fd] = strdup(pathname);

	return ttrace->paths.table[fd] != NULL ? 0 : -1;
/* Resolve 'fd' to a path via /proc/<pid>/fd (or /proc/<pid>/task/<tid>/fd
 * for non-leader threads) and cache it.  Returns 0 on success, -1 when the
 * link cannot be read or changed size between lstat() and readlink().
 * NOTE(review): the 'struct stat st' / 'int ret' declarations, braces and
 * the early -1 returns are missing from this excerpt. */
static int thread__read_fd_path(struct thread *thread, int fd)
	char linkname[PATH_MAX], pathname[PATH_MAX];

	if (thread->pid_ == thread->tid) {
		/* thread group leader: the short form suffices */
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/fd/%d", thread->pid_, fd);
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);

	/* ensure the target still fits, +1 for the NUL we add below */
	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))

	ret = readlink(linkname, pathname, sizeof(pathname));

	if (ret < 0 || ret > st.st_size)

	/* readlink(2) does not NUL-terminate */
	pathname[ret] = '\0';
	return trace__set_fd_pathname(thread, fd, pathname);
/* Return the cached path for 'fd', reading it from /proc on a miss (live
 * mode only), or NULL when unknown.  Bumps the proc_getname tool stat.
 * NOTE(review): the 'struct trace *trace' parameter line, negative-fd and
 * live-mode guards, and NULL returns are missing from this excerpt. */
static const char *thread__fd_path(struct thread *thread, int fd,
	struct thread_trace *ttrace = thread__priv(thread);

	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {

		++trace->stats.proc_getname;
		if (thread__read_fd_path(thread, fd))

	return ttrace->paths.table[fd];

/* Print an fd argument as "<num><path>" when the path is known. */
size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
	size_t printed = scnprintf(bf, size, "%d", fd);
	const char *path = thread__fd_path(arg->thread, fd, arg->trace);

		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
/* Print close(2)'s fd like a normal fd, then drop its cached path — the
 * number may be reused for a different file afterwards.
 * NOTE(review): the 'int fd = arg->val;' line, braces and return are missing
 * from this excerpt. */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg)
	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
	struct thread_trace *ttrace = thread__priv(arg->thread);

	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
		zfree(&ttrace->paths.table[fd]);
1001 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1004 struct thread_trace *ttrace = thread__priv(thread);
1006 ttrace->filename.ptr = ptr;
1007 ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
/* Filename argument: when the vfs_getname source is available, record the
 * splice position and print nothing yet (the name arrives in a later
 * tracepoint); otherwise just print the user pointer in hex.
 * NOTE(review): the trailing 'return 0;' and braces are missing from this
 * excerpt. */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg)
	unsigned long ptr = arg->val;

	if (!arg->trace->vfs_getname)
		return scnprintf(bf, size, "%#x", ptr);

	thread__set_filename_pos(arg->thread, bf, ptr);
1022 static bool trace__filter_duration(struct trace *trace, double t)
1024 return t < (trace->duration_filter * NSEC_PER_MSEC);
1027 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1029 double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1031 return fprintf(fp, "%10.3f ", ts);
1035 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1036 * using ttrace->entry_time for a thread that receives a sys_exit without
1037 * first having received a sys_enter ("poll" issued before tracing session
1038 * starts, lost sys_enter exit due to ring buffer overflow).
1040 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1043 return __trace__fprintf_tstamp(trace, tstamp, fp);
1045 return fprintf(fp, " ? ");
/* Set from the signal handler, polled by the main loop. */
static bool done	= false;
static bool interrupted	= false;

/*
 * SIGINT/SIGCHLD handler: stop the main loop; remember whether the stop was
 * user-initiated (SIGINT) so the caller can skip the summary.
 * Only writes two flags, which is async-signal-safe in practice.
 * (Truncated in the excerpt: 'done = true;' and braces restored.)
 */
static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
/* Print the common line prefix: timestamp, duration, and — when tracing
 * multiple threads — "comm/" and the tid.  Returns chars printed.
 * NOTE(review): braces and the final return are missing from this excerpt. */
static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
	printed += fprintf_duration(duration, duration_calculated, fp);

	if (trace->multiple_threads) {
		if (trace->show_comm)
			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
		printed += fprintf(fp, "%d ", thread->tid);
/* Dispatch side-band events: report ring-buffer losses loudly, forward
 * everything (including LOST) to the machine state updater.
 * NOTE(review): the 'int ret' declaration, 'break'/'default' labels, braces
 * and the final return are missing from this excerpt. */
static int trace__process_event(struct trace *trace, struct machine *machine,
				union perf_event *event, struct perf_sample *sample)
	switch (event->header.type) {
	case PERF_RECORD_LOST:
		color_fprintf(trace->output, PERF_COLOR_RED,
			      "LOST %" PRIu64 " events!\n", event->lost.lost);
		ret = machine__process_lost_event(machine, event, sample);
		/* default: everything else updates thread/mmap state */
		ret = machine__process_event(machine, event, sample);
1091 static int trace__tool_process(struct perf_tool *tool,
1092 union perf_event *event,
1093 struct perf_sample *sample,
1094 struct machine *machine)
1096 struct trace *trace = container_of(tool, struct trace, tool);
1097 return trace__process_event(trace, machine, event, sample);
/* Kernel-symbol resolver wrapper: warn once (then bail) when kptr_restrict
 * hides kernel addresses, otherwise defer to the generic resolver.
 * NOTE(review): the 'return NULL;' lines after the warned/restricted checks
 * and braces are missing from this excerpt. */
static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
	struct machine *machine = vmachine;

	if (machine->kptr_restrict_warned)

	if (symbol_conf.kptr_restrict) {
		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
			   "Kernel samples will not be resolved.\n");
		machine->kptr_restrict_warned = true;

	return machine__resolve_kernel_addr(vmachine, addrp, modp);
/* Set up symbol resolution and a host machine object, then synthesize
 * existing-thread events so already-running targets are known.
 * NOTE(review): the early-return on symbol__init failure, error gotos and
 * the final return are missing from this excerpt. */
static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
	int err = symbol__init(NULL);

	trace->host = machine__new_host();
	if (trace->host == NULL)

	if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)

	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->threads, trace__tool_process, false,
					    trace->opts.proc_map_timeout);
/* Allocate sc->arg_fmt and seed it from the static syscall_fmts entry when
 * one exists; an explicit fmt->nr_args overrides the tracepoint's count
 * (used when the tp_format is unavailable and 6 was assumed).
 * NOTE(review): the -ENOMEM return, the fmt guard inside the loop, braces
 * and the final 'return 0;' are missing from this excerpt. */
static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
	if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
		nr_args = sc->fmt->nr_args;

	sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
	if (sc->arg_fmt == NULL)

	for (idx = 0; idx < nr_args; ++idx) {
			sc->arg_fmt[idx] = sc->fmt->arg[idx];

	sc->nr_args = nr_args;
/* Walk the tracepoint's argument fields and pick a default formatter for
 * each slot that syscall_fmts[] did not already fill: filenames, pointers,
 * pids, modes, and anything whose name ends in "fd".
 * NOTE(review): declarations ('int idx = 0, len;'), 'continue' after the
 * explicit-fmt check, braces and the final return are missing from this
 * excerpt. */
static int syscall__set_arg_fmts(struct syscall *sc)
	struct format_field *field;

	for (field = sc->args; field; field = field->next, ++idx) {
		/* respect an explicit per-syscall formatter */
		if (sc->fmt && sc->fmt->arg[idx].scnprintf)

		if (strcmp(field->type, "const char *") == 0 &&
		    (strcmp(field->name, "filename") == 0 ||
		     strcmp(field->name, "path") == 0 ||
		     strcmp(field->name, "pathname") == 0))
			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 */
			sc->arg_fmt[idx].scnprintf = SCA_FD;
/* Lazily populate trace->syscalls.table[id]: grow the table, look up the
 * name and static fmt, resolve the sys_enter_<name> tracepoint (trying the
 * alias on failure), allocate arg formatters and bind defaults.
 * NOTE(review): declarations ('char tp_name[128]; struct syscall *sc;'),
 * several error returns, 'sc->name = name;' and braces are missing from
 * this excerpt. */
static int trace__read_syscall_info(struct trace *trace, int id)
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)

		if (trace->syscalls.max != -1) {
			/* zero only the newly added entries */
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;

	sc = trace->syscalls.table + id;

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* retry under the alias (e.g. "pread" -> "pread64") */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* no tp_format -> assume the 6-register maximum */
	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))

	if (IS_ERR(sc->tp_format))

	sc->args = sc->tp_format->format.fields;
	/*
	 * We need to check and discard the first variable '__syscall_nr'
	 * or 'nr' that mean the syscall number. It is needless here.
	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
/*
 * Translate the -e/--expr syscall-name qualifier list into an array of
 * syscall ids (trace->ev_qualifier_ids).  Unknown names are reported to
 * trace->output with a hint, and on any failure the ids array is freed.
 * Returns 0 when all names resolved.  (Error paths partially elided.)
 */
1261 static int trace__validate_ev_qualifier(struct trace *trace)
1264 	struct str_node *pos;
1266 	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1267 	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1268 						 sizeof(trace->ev_qualifier_ids.entries[0]));
1270 	if (trace->ev_qualifier_ids.entries == NULL) {
1271 		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
/* Resolve each qualifier name to its id; collect errors for all bad names. */
1279 	strlist__for_each_entry(pos, trace->ev_qualifier) {
1280 		const char *sc = pos->s;
1281 		int id = syscalltbl__id(trace->sctbl, sc);
1285 				fputs("Error:\tInvalid syscall ", trace->output);
1288 				fputs(", ", trace->output);
1291 			fputs(sc, trace->output);
1294 		trace->ev_qualifier_ids.entries[i++] = id;
1298 		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1299 		      "\nHint:\tand: 'man syscalls'\n", trace->output);
/* At least one name failed: discard the partially filled ids array. */
1300 		zfree(&trace->ev_qualifier_ids.entries);
1301 		trace->ev_qualifier_ids.nr = 0;
1308  * args is to be interpreted as a series of longs but we need to handle
1309  * 8-byte unaligned accesses. args points to raw_data within the event
1310  * and raw_data is guaranteed to be 8-byte unaligned because it is
1311  * preceded by raw_size which is a u32. So we need to copy args to a temp
1312  * variable to read it. Most notably this avoids extended load instructions
1313  * on unaligned addresses
/* Fetch the idx-th syscall argument as an unsigned long (see note above). */
1315 unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1318 	unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1320 	memcpy(&val, p, sizeof(val));
/*
 * Print the current argument's label into bf: "name: " when the arg format
 * table has a name for it, "argN: " otherwise.  Returns chars printed.
 */
1324 static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1325 				      struct syscall_arg *arg)
1327 	if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1328 		return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1330 	return scnprintf(bf, size, "arg%d: ", arg->idx);
/*
 * Print one argument value: use the per-argument scnprintf beautifier if
 * one is registered (passing along its 'parm' cookie), else print the raw
 * value as a signed long.  Returns chars printed.
 */
1333 static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1334 				     struct syscall_arg *arg, unsigned long val)
1336 	if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1338 		if (sc->arg_fmt[arg->idx].parm)
1339 			arg->parm = sc->arg_fmt[arg->idx].parm;
1340 		return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1342 	return scnprintf(bf, size, "%ld", val);
/*
 * Format all arguments of a syscall invocation into bf, either using the
 * tracepoint format fields (preferred) or, when the /format file could not
 * be read, as raw "argN: val" pairs.  Returns chars printed.
 * (Some declarations/conditions between visible lines are elided.)
 */
1345 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1346 				      unsigned char *args, struct trace *trace,
1347 				      struct thread *thread)
1352 	struct syscall_arg arg = {
1359 	struct thread_trace *ttrace = thread__priv(thread);
1362 	 * Things like fcntl will set this in its 'cmd' formatter to pick the
1363 	 * right formatter for the return value (an fd? file flags?), which is
1364 	 * not needed for syscalls that always return a given type, say an fd.
1366 	ttrace->ret_scnprintf = NULL;
1368 	if (sc->args != NULL) {
1369 		struct format_field *field;
/* Walk the tracepoint fields in order, tracking the arg index and a bitmask. */
1371 		for (field = sc->args; field;
1372 		     field = field->next, ++arg.idx, bit <<= 1) {
1376 			val = syscall_arg__val(&arg, arg.idx);
1379 			 * Suppress this argument if its value is zero and
1380 			 * and we don't have a string associated in an
/* ...unless the formatter opted in via show_zero or is a string-array one. */
1385 			      (sc->arg_fmt[arg.idx].show_zero ||
1386 			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
1387 			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
1388 			      sc->arg_fmt[arg.idx].parm))
1391 			printed += scnprintf(bf + printed, size - printed,
1392 					     "%s%s: ", printed ? ", " : "", field->name);
1393 			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
1395 	} else if (IS_ERR(sc->tp_format)) {
1397 		 * If we managed to read the tracepoint /format file, then we
1398 		 * may end up not having any args, like with gettid(), so only
1399 		 * print the raw args when we didn't manage to read it.
1401 		while (arg.idx < sc->nr_args) {
1404 			val = syscall_arg__val(&arg, arg.idx);
1406 				printed += scnprintf(bf + printed, size - printed, ", ");
1407 			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
1408 			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
/* Signature shared by all per-tracepoint sample handlers (evsel->handler). */
1418 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1419 				  union perf_event *event,
1420 				  struct perf_sample *sample);
/*
 * Look up (and lazily read, via trace__read_syscall_info()) the syscall
 * descriptor for 'id'.  Returns NULL for invalid ids or when the info
 * cannot be read, logging to trace->output.  (Error-path labels elided.)
 */
1422 static struct syscall *trace__syscall_info(struct trace *trace,
1423 					   struct perf_evsel *evsel, int id)
1429 		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1430 		 * before that, leaving at a higher verbosity level till that is
1431 		 * explained. Reproduced with plain ftrace with:
1433 		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1434 		 * grep "NR -1 " /t/trace_pipe
1436 		 * After generating some load on the machine.
1440 			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1441 				id, perf_evsel__name(evsel), ++n);
/* Table slot empty or table too small: (re)read the syscall info now. */
1446 	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1447 	    trace__read_syscall_info(trace, id))
1450 	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1453 	return &trace->syscalls.table[id];
/* Failure reporting path. */
1457 	fprintf(trace->output, "Problems reading syscall %d", id);
1458 	if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1459 		fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1460 	fputs(" information\n", trace->output);
/*
 * Fold this syscall's duration into the per-thread, per-syscall-id stats
 * (used by the --summary output).  Stats nodes are created on first use.
 */
1465 static void thread__update_stats(struct thread_trace *ttrace,
1466 				 int id, struct perf_sample *sample)
1468 	struct int_node *inode;
1469 	struct stats *stats;
1472 	inode = intlist__findnew(ttrace->syscall_stats, id);
1476 	stats = inode->priv;
1477 	if (stats == NULL) {
1478 		stats = malloc(sizeof(struct stats));
1482 		inode->priv = stats;
/* Only trust the delta when we actually saw the matching sys_enter. */
1485 	if (ttrace->entry_time && sample->time > ttrace->entry_time)
1486 		duration = sample->time - ttrace->entry_time;
1488 	update_stats(stats, duration);
/*
 * A new event arrived while the current thread still had a sys_enter line
 * pending: flush that half-line with a trailing " ...\n" so output stays
 * readable when threads interleave.  Returns chars printed.
 */
1491 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1493 	struct thread_trace *ttrace;
1497 	if (trace->current == NULL)
1500 	ttrace = thread__priv(trace->current);
1502 	if (!ttrace->entry_pending)
1505 	duration = sample->time - ttrace->entry_time;
1507 	printed  = trace__fprintf_entry_head(trace, trace->current, duration, true, ttrace->entry_time, trace->output);
1508 	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1509 	ttrace->entry_pending = false;
/*
 * Handler for raw_syscalls:sys_enter: format the syscall name and args into
 * the per-thread entry_str.  For syscalls that return (and when filtering
 * allows) printing is deferred until sys_exit so the result goes on the
 * same line; otherwise the entry is printed immediately.
 * (Some declarations/early-exit checks between visible lines are elided.)
 */
1514 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1515 			    union perf_event *event __maybe_unused,
1516 			    struct perf_sample *sample)
1521 	struct thread *thread;
1522 	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1523 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1524 	struct thread_trace *ttrace;
1529 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1530 	ttrace = thread__trace(thread, trace->output);
1534 	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
/* Lazily allocate the per-thread line buffer. */
1536 	if (ttrace->entry_str == NULL) {
1537 		ttrace->entry_str = malloc(trace__entry_str_size);
1538 		if (!ttrace->entry_str)
1542 	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1543 		trace__printf_interrupted_entry(trace, sample);
1545 	ttrace->entry_time = sample->time;
1546 	msg = ttrace->entry_str;
1547 	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1549 	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1550 					   args, trace, thread);
/* Non-returning syscalls (exit/exit_group) are printed right away. */
1553 		if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
1554 			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
1555 			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
1558 		ttrace->entry_pending = true;
1559 		/* See trace__vfs_getname & trace__sys_exit */
1560 		ttrace->filename.pending_open = false;
/* Remember which thread owns the pending line, for interruption handling. */
1563 	if (trace->current != thread) {
1564 		thread__put(trace->current);
1565 		trace->current = thread__get(thread);
1569 	thread__put(thread);
/*
 * Resolve the sample's callchain into 'cursor', bounded by trace->max_stack.
 * Returns non-zero when address resolution or unwinding fails.
 */
1573 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1574 				    struct perf_sample *sample,
1575 				    struct callchain_cursor *cursor)
1577 	struct addr_location al;
1579 	if (machine__resolve(trace->host, &al, sample) < 0 ||
1580 	    thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
/*
 * Print the already-resolved callchain (global callchain_cursor) to
 * trace->output, one symbol per line, 38-column left pad.
 */
1586 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1588 	/* TODO: user-configurable print_opts */
1589 	const unsigned int print_opts = EVSEL__PRINT_SYM |
1591 					EVSEL__PRINT_UNKNOWN_AS_ADDR;
1593 	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
/*
 * Handler for raw_syscalls:sys_exit: completes the line started at
 * sys_enter (or prints a "continued" marker when the entry was flushed),
 * formats the return value per the syscall's fmt (errno name, hex, pid,
 * timeout, or a beautifier picked at enter time), updates --summary stats
 * and, when enabled, prints the callchain.
 * (Declarations, goto labels and some branches are elided in this view.)
 */
1596 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1597 			   union perf_event *event __maybe_unused,
1598 			   struct perf_sample *sample)
1602 	bool duration_calculated = false;
1603 	struct thread *thread;
1604 	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1605 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1606 	struct thread_trace *ttrace;
1611 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1612 	ttrace = thread__trace(thread, trace->output);
1617 		thread__update_stats(ttrace, id, sample);
1619 	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
/* A successful open with a pending vfs_getname: record fd -> pathname. */
1621 	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
1622 		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1623 		ttrace->filename.pending_open = false;
1624 		++trace->stats.vfs_getname;
1627 	if (ttrace->entry_time) {
1628 		duration = sample->time - ttrace->entry_time;
1629 		if (trace__filter_duration(trace, duration))
1631 		duration_calculated = true;
1632 	} else if (trace->duration_filter)
1635 	if (sample->callchain) {
1636 		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1637 		if (callchain_ret == 0) {
1638 			if (callchain_cursor.nr < trace->min_stack)
1644 	if (trace->summary_only)
1647 	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
1649 	if (ttrace->entry_pending) {
1650 		fprintf(trace->output, "%-70s", ttrace->entry_str);
/* Entry was flushed earlier (interrupted): reprint the syscall name. */
1652 		fprintf(trace->output, " ... [");
1653 		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1654 		fprintf(trace->output, "]: %s()", sc->name);
1657 	if (sc->fmt == NULL) {
1661 		fprintf(trace->output, ") = %ld", ret);
1662 	} else if (ret < 0) {
1664 			char bf[STRERR_BUFSIZE];
1665 			const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
1666 				   *e = audit_errno_to_name(-ret);
1668 			fprintf(trace->output, ") = -1 %s %s", e, emsg);
1670 	} else if (ret == 0 && sc->fmt->timeout)
1671 		fprintf(trace->output, ") = 0 Timeout");
/* Return-value beautifier chosen by an arg formatter at enter time (fcntl). */
1672 	else if (ttrace->ret_scnprintf) {
1674 		struct syscall_arg arg = {
1679 		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
1680 		ttrace->ret_scnprintf = NULL;
1681 		fprintf(trace->output, ") = %s", bf);
1682 	} else if (sc->fmt->hexret)
1683 		fprintf(trace->output, ") = %#lx", ret);
1684 	else if (sc->fmt->errpid) {
/* ret is a pid (e.g. fork/wait*): show the child's comm when known. */
1685 		struct thread *child = machine__find_thread(trace->host, ret, ret);
1687 		if (child != NULL) {
1688 			fprintf(trace->output, ") = %ld", ret);
1689 			if (child->comm_set)
1690 				fprintf(trace->output, " (%s)", thread__comm_str(child));
1696 	fputc('\n', trace->output);
1698 	if (callchain_ret > 0)
1699 		trace__fprintf_callchain(trace, sample);
1700 	else if (callchain_ret < 0)
1701 		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1703 	ttrace->entry_pending = false;
1706 	thread__put(thread);
/*
 * Handler for probe:vfs_getname: captures the pathname being resolved by
 * the kernel and splices it into the pending sys_enter line at the position
 * reserved by the filename beautifier, truncating from the left if needed.
 * Also stashes the name so sys_exit can map the returned fd to a path.
 * (Some early-exit checks between visible lines are elided.)
 */
1710 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1711 			      union perf_event *event __maybe_unused,
1712 			      struct perf_sample *sample)
1714 	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1715 	struct thread_trace *ttrace;
1716 	size_t filename_len, entry_str_len, to_move;
1717 	ssize_t remaining_space;
1719 	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1724 	ttrace = thread__priv(thread);
1728 	filename_len = strlen(filename);
1729 	if (filename_len == 0)
/* Grow the per-thread filename buffer to fit this path. */
1732 	if (ttrace->filename.namelen < filename_len) {
1733 		char *f = realloc(ttrace->filename.name, filename_len + 1);
1738 		ttrace->filename.namelen = filename_len;
1739 		ttrace->filename.name = f;
1742 	strcpy(ttrace->filename.name, filename);
1743 	ttrace->filename.pending_open = true;
/* No insertion point was reserved in entry_str: nothing to splice. */
1745 	if (!ttrace->filename.ptr)
1748 	entry_str_len = strlen(ttrace->entry_str);
1749 	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1750 	if (remaining_space <= 0)
/* Not enough room: keep the tail of the path, which is the useful part. */
1753 	if (filename_len > (size_t)remaining_space) {
1754 		filename += filename_len - remaining_space;
1755 		filename_len = remaining_space;
1758 	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1759 	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1760 	memmove(pos + filename_len, pos, to_move);
1761 	memcpy(pos, filename, filename_len);
1763 	ttrace->filename.ptr = 0;
1764 	ttrace->filename.entry_str_pos = 0;
1766 	thread__put(thread);
/*
 * Handler for sched:sched_stat_runtime: accumulate per-thread and global
 * on-CPU time in milliseconds (used by the thread summary).  The trailing
 * fprintf is the failure path, dumping the raw event fields.
 */
1771 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1772 				     union perf_event *event __maybe_unused,
1773 				     struct perf_sample *sample)
1775 	u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1776 	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1777 	struct thread *thread = machine__findnew_thread(trace->host,
1780 	struct thread_trace *ttrace = thread__trace(thread, trace->output);
1785 	ttrace->runtime_ms += runtime_ms;
1786 	trace->runtime_ms += runtime_ms;
1788 	thread__put(thread);
/* Error path: could not get the thread_trace, report the raw payload. */
1792 	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1794 	       perf_evsel__strval(evsel, sample, "comm"),
1795 	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1797 	       perf_evsel__intval(evsel, sample, "vruntime"));
/*
 * print_binary() callback for BPF output events: emit printable bytes
 * as-is and everything else as '.'; all other printer ops are ignored.
 */
1801 static void bpf_output__printer(enum binary_printer_ops op,
1802 				unsigned int val, void *extra)
1804 	FILE *output = extra;
1805 	unsigned char ch = (unsigned char)val;
1808 	case BINARY_PRINT_CHAR_DATA:
1809 		fprintf(output, "%c", isprint(ch) ? ch : '.');
/* Layout-only ops: no output for the strace-like one-line format. */
1811 	case BINARY_PRINT_DATA_BEGIN:
1812 	case BINARY_PRINT_LINE_BEGIN:
1813 	case BINARY_PRINT_ADDR:
1814 	case BINARY_PRINT_NUM_DATA:
1815 	case BINARY_PRINT_NUM_PAD:
1816 	case BINARY_PRINT_SEP:
1817 	case BINARY_PRINT_CHAR_PAD:
1818 	case BINARY_PRINT_LINE_END:
1819 	case BINARY_PRINT_DATA_END:
/* Dump a BPF output event's raw payload as printable chars to trace->output. */
1825 static void bpf_output__fprintf(struct trace *trace,
1826 				struct perf_sample *sample)
1828 	print_binary(sample->raw_data, sample->raw_size, 8,
1829 		     bpf_output__printer, trace->output);
/*
 * Generic handler for --event tracepoints and BPF output events: print a
 * timestamped line with the event name and its pretty-printed payload,
 * plus the callchain when requested.
 */
1832 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1833 				union perf_event *event __maybe_unused,
1834 				struct perf_sample *sample)
1836 	int callchain_ret = 0;
1838 	if (sample->callchain) {
1839 		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1840 		if (callchain_ret == 0) {
1841 			if (callchain_cursor.nr < trace->min_stack)
1847 	trace__printf_interrupted_entry(trace, sample);
1848 	trace__fprintf_tstamp(trace, sample->time, trace->output);
/* Keep columns aligned with syscall lines when both are being printed. */
1850 	if (trace->trace_syscalls)
1851 		fprintf(trace->output, "(         ): ");
1853 	fprintf(trace->output, "%s:", evsel->name);
1855 	if (perf_evsel__is_bpf_output(evsel)) {
1856 		bpf_output__fprintf(trace, sample);
1857 	} else if (evsel->tp_format) {
1858 		event_format__fprintf(evsel->tp_format, sample->cpu,
1859 				      sample->raw_data, sample->raw_size,
1863 	fprintf(trace->output, ")\n");
1865 	if (callchain_ret > 0)
1866 		trace__fprintf_callchain(trace, sample);
1867 	else if (callchain_ret < 0)
1868 		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
/*
 * Print a resolved address location for page-fault lines: optional
 * "dso@" prefix, then "sym+0xoff" when the symbol is known, otherwise
 * the raw address (map-relative, or sample->addr when unmapped).
 */
1873 static void print_location(FILE *f, struct perf_sample *sample,
1874 			   struct addr_location *al,
1875 			   bool print_dso, bool print_sym)
1878 	if ((verbose > 0 || print_dso) && al->map)
1879 		fprintf(f, "%s@", al->map->dso->long_name);
1881 	if ((verbose > 0 || print_sym) && al->sym)
1882 		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1883 			al->addr - al->sym->start);
1885 		fprintf(f, "0x%" PRIx64, al->addr);
1887 		fprintf(f, "0x%" PRIx64, sample->addr);
/*
 * Handler for page-fault software events (major and minor): prints
 * "majfault"/"minfault" with the faulting IP location and the faulted
 * address location, plus the callchain when requested.
 * (Some branches between visible lines are elided.)
 */
1890 static int trace__pgfault(struct trace *trace,
1891 			  struct perf_evsel *evsel,
1892 			  union perf_event *event __maybe_unused,
1893 			  struct perf_sample *sample)
1895 	struct thread *thread;
1896 	struct addr_location al;
1897 	char map_type = 'd';
1898 	struct thread_trace *ttrace;
1900 	int callchain_ret = 0;
1902 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1904 	if (sample->callchain) {
1905 		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1906 		if (callchain_ret == 0) {
1907 			if (callchain_cursor.nr < trace->min_stack)
1913 	ttrace = thread__trace(thread, trace->output);
/* Count major vs minor faults separately for the summary. */
1917 	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
1922 	if (trace->summary_only)
/* Resolve the faulting instruction address. */
1925 	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
1928 	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
1930 	fprintf(trace->output, "%sfault [",
1931 		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
1934 	print_location(trace->output, sample, &al, false, true);
1936 	fprintf(trace->output, "] => ");
/* Resolve the faulted data address; fall back to the code maps. */
1938 	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
1942 		thread__find_addr_location(thread, sample->cpumode,
1943 					   MAP__FUNCTION, sample->addr, &al);
1951 	print_location(trace->output, sample, &al, true, false);
1953 	fprintf(trace->output, " (%c%c)\n", map_type, al.level);
1955 	if (callchain_ret > 0)
1956 		trace__fprintf_callchain(trace, sample);
1957 	else if (callchain_ret < 0)
1958 		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1962 	thread__put(thread);
/*
 * Record the first usable sample timestamp as the base for relative
 * timestamps (unless --time/full_time was requested).
 */
1966 static void trace__set_base_time(struct trace *trace,
1967 				 struct perf_evsel *evsel,
1968 				 struct perf_sample *sample)
1971 	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1972 	 * and don't use sample->time unconditionally, we may end up having
1973 	 * some other event in the future without PERF_SAMPLE_TIME for good
1974 	 * reason, i.e. we may not be interested in its timestamps, just in
1975 	 * it taking place, picking some piece of information when it
1976 	 * appears in our event stream (vfs_getname comes to mind).
1978 	if (trace->base_time == 0 && !trace->full_time &&
1979 	    (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1980 		trace->base_time = sample->time;
/*
 * perf_tool sample callback used in replay mode (perf.data): skip
 * filtered threads, update the base time, then dispatch to the evsel's
 * registered tracepoint handler.
 */
1983 static int trace__process_sample(struct perf_tool *tool,
1984 				 union perf_event *event,
1985 				 struct perf_sample *sample,
1986 				 struct perf_evsel *evsel,
1987 				 struct machine *machine __maybe_unused)
1989 	struct trace *trace = container_of(tool, struct trace, tool);
1990 	struct thread *thread;
1993 	tracepoint_handler handler = evsel->handler;
1995 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1996 	if (thread && thread__is_filtered(thread))
1999 	trace__set_base_time(trace, evsel, sample);
2003 		handler(trace, evsel, event, sample);
2006 	thread__put(thread);
/*
 * Implements 'perf trace record': build an argv for 'perf record' adding
 * the raw_syscalls (or legacy syscalls) tracepoints and any requested
 * page-fault events, then hand off to cmd_record().
 */
2010 static int trace__record(struct trace *trace, int argc, const char **argv)
2012 	unsigned int rec_argc, i, j;
2013 	const char **rec_argv;
2014 	const char * const record_args[] = {
2021 	const char * const sc_args[] = { "-e", };
2022 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2023 	const char * const majpf_args[] = { "-e", "major-faults" };
2024 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2025 	const char * const minpf_args[] = { "-e", "minor-faults" };
2026 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2028 	/* +1 is for the event string below */
2029 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2030 		majpf_args_nr + minpf_args_nr + argc;
2031 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
2033 	if (rec_argv == NULL)
2037 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
2038 		rec_argv[j++] = record_args[i];
2040 	if (trace->trace_syscalls) {
2041 		for (i = 0; i < sc_args_nr; i++)
2042 			rec_argv[j++] = sc_args[i];
2044 		/* event string may be different for older kernels - e.g., RHEL6 */
2045 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2046 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2047 		else if (is_valid_tracepoint("syscalls:sys_enter"))
2048 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2050 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2055 	if (trace->trace_pgfaults & TRACE_PFMAJ)
2056 		for (i = 0; i < majpf_args_nr; i++)
2057 			rec_argv[j++] = majpf_args[i];
2059 	if (trace->trace_pgfaults & TRACE_PFMIN)
2060 		for (i = 0; i < minpf_args_nr; i++)
2061 			rec_argv[j++] = minpf_args[i];
/* Append the user's own arguments (workload or extra options) last. */
2063 	for (i = 0; i < (unsigned int)argc; i++)
2064 		rec_argv[j++] = argv[i];
2066 	return cmd_record(j, rec_argv);
2069 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
/*
 * Try to add the probe:vfs_getname tracepoint (set up beforehand with
 * 'perf probe').  Returns false when the probe or its "pathname" field
 * is unavailable; pathname beautifying then degrades gracefully.
 */
2071 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2073 	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2078 	if (perf_evsel__field(evsel, "pathname") == NULL) {
2079 		perf_evsel__delete(evsel);
2083 	evsel->handler = trace__vfs_getname;
2084 	perf_evlist__add(evlist, evsel);
/*
 * Create a software page-fault evsel (config selects MAJ or MIN),
 * sampling every fault, with its handler set to trace__pgfault.
 */
2088 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2090 	struct perf_evsel *evsel;
2091 	struct perf_event_attr attr = {
2092 		.type = PERF_TYPE_SOFTWARE,
2096 	attr.config = config;
2097 	attr.sample_period = 1;
2099 	event_attr_init(&attr);
2101 	evsel = perf_evsel__new(&attr);
2103 		evsel->handler = trace__pgfault;
/*
 * Live-mode event dispatcher: route non-sample records to the generic
 * machinery, look up the evsel by sample id, sanity-check tracepoint
 * payloads, and invoke the evsel's handler.
 */
2108 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2110 	const u32 type = event->header.type;
2111 	struct perf_evsel *evsel;
2113 	if (type != PERF_RECORD_SAMPLE) {
2114 		trace__process_event(trace, trace->host, event, sample);
2118 	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2119 	if (evsel == NULL) {
2120 		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2124 	trace__set_base_time(trace, evsel, sample);
/* A tracepoint sample must carry raw data; warn instead of crashing. */
2126 	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2127 	    sample->raw_data == NULL) {
2128 		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2129 		       perf_evsel__name(evsel), sample->tid,
2130 		       sample->cpu, sample->raw_size);
2132 		tracepoint_handler handler = evsel->handler;
2133 		handler(trace, evsel, event, sample);
/*
 * Create and add the raw_syscalls:sys_enter/sys_exit tracepoint evsels,
 * wiring up their handlers and caching the 'args'/'ret' field offsets.
 * Returns 0 on success; on failure the created evsels are deleted.
 */
2137 static int trace__add_syscall_newtp(struct trace *trace)
2140 	struct perf_evlist *evlist = trace->evlist;
2141 	struct perf_evsel *sys_enter, *sys_exit;
2143 	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2144 	if (sys_enter == NULL)
2147 	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2148 		goto out_delete_sys_enter;
2150 	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2151 	if (sys_exit == NULL)
2152 		goto out_delete_sys_enter;
2154 	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2155 		goto out_delete_sys_exit;
2157 	perf_evlist__add(evlist, sys_enter);
2158 	perf_evlist__add(evlist, sys_exit);
2160 	if (callchain_param.enabled && !trace->kernel_syscallchains) {
2162 		 * We're interested only in the user space callchain
2163 		 * leading to the syscall, allow overriding that for
2164 		 * debugging reasons using --kernel_syscall_callchains
2166 		sys_exit->attr.exclude_callchain_kernel = 1;
2169 	trace->syscalls.events.sys_enter = sys_enter;
2170 	trace->syscalls.events.sys_exit  = sys_exit;
/* Unwind-on-error cleanup (goto targets). */
2176 out_delete_sys_exit:
2177 	perf_evsel__delete_priv(sys_exit);
2178 out_delete_sys_enter:
2179 	perf_evsel__delete_priv(sys_enter);
/*
 * Turn the validated qualifier ids into an "id in (...)" (or negated)
 * tracepoint filter expression and append it to both the sys_enter and
 * sys_exit evsels.  (Filter-string cleanup is elided in this view.)
 */
2183 static int trace__set_ev_qualifier_filter(struct trace *trace)
2186 	struct perf_evsel *sys_exit;
2187 	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2188 						trace->ev_qualifier_ids.nr,
2189 						trace->ev_qualifier_ids.entries);
2194 	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2196 		sys_exit = trace->syscalls.events.sys_exit;
2197 		err = perf_evsel__append_tp_filter(sys_exit, filter);
/*
 * Filter out perf trace's own pid plus, when running under ssh, the sshd
 * ancestors, to avoid a feedback loop where tracing our own output
 * generates more events to trace.
 */
2208 static int trace__set_filter_loop_pids(struct trace *trace)
2210 	unsigned int nr = 1;
2214 	struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
2216 	while (thread && nr < ARRAY_SIZE(pids)) {
2217 		struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
2222 		if (!strcmp(thread__comm_str(parent), "sshd")) {
2223 			pids[nr++] = parent->tid;
2229 	return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
/*
 * Live-tracing main loop: set up the evlist (syscall tracepoints,
 * vfs_getname, page faults, sched_stat_runtime), create target maps,
 * configure callchains, open/mmap the events, optionally fork the
 * workload, then poll the ring buffers dispatching samples until done.
 * Error handling uses the usual goto-label unwind at the bottom.
 * (Many lines are elided in this view; comments cover visible code only.)
 */
2232 static int trace__run(struct trace *trace, int argc, const char **argv)
2234 	struct perf_evlist *evlist = trace->evlist;
2235 	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2237 	unsigned long before;
2238 	const bool forks = argc > 0;
2239 	bool draining = false;
2243 	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2244 		goto out_error_raw_syscalls;
2246 	if (trace->trace_syscalls)
2247 		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2249 	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2250 		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2251 		if (pgfault_maj == NULL)
2253 		perf_evlist__add(evlist, pgfault_maj);
2256 	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2257 		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2258 		if (pgfault_min == NULL)
2260 		perf_evlist__add(evlist, pgfault_min);
2264 	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2265 				   trace__sched_stat_runtime))
2266 		goto out_error_sched_stat_runtime;
2268 	err = perf_evlist__create_maps(evlist, &trace->opts.target);
2270 		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2271 		goto out_delete_evlist;
2274 	err = trace__symbols_init(trace, evlist);
2276 		fprintf(trace->output, "Problems initializing symbol libraries!\n");
2277 		goto out_delete_evlist;
2280 	perf_evlist__config(evlist, &trace->opts, NULL);
/* Enable callchains only on the evsels that need them. */
2282 	if (callchain_param.enabled) {
2283 		bool use_identifier = false;
2285 		if (trace->syscalls.events.sys_exit) {
2286 			perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
2287 						     &trace->opts, &callchain_param);
2288 			use_identifier = true;
2292 			perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2293 			use_identifier = true;
2297 			perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2298 			use_identifier = true;
2301 		if (use_identifier) {
2303 		       * Now we have evsels with different sample_ids, use
2304 		       * PERF_SAMPLE_IDENTIFIER to map from sample to evsel
2305 		       * from a fixed position in each ring buffer record.
2307 		       * As of this the changeset introducing this comment, this
2308 		       * isn't strictly needed, as the fields that can come before
2309 		       * PERF_SAMPLE_ID are all used, but we'll probably disable
2310 		       * some of those for things like copying the payload of
2311 		       * pointer syscall arguments, and for vfs_getname we don't
2312 		       * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
2313 		       * here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
2315 			perf_evlist__set_sample_bit(evlist, IDENTIFIER);
2316 			perf_evlist__reset_sample_bit(evlist, ID);
2320 	signal(SIGCHLD, sig_handler);
2321 	signal(SIGINT, sig_handler);
2324 		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2327 			fprintf(trace->output, "Couldn't run the workload!\n");
2328 			goto out_delete_evlist;
2332 	err = perf_evlist__open(evlist);
2334 		goto out_error_open;
2336 	err = bpf__apply_obj_config();
2338 		char errbuf[BUFSIZ];
2340 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2341 		pr_err("ERROR: Apply config to BPF failed: %s\n",
2343 		goto out_error_open;
2347 	 * Better not use !target__has_task() here because we need to cover the
2348 	 * case where no threads were specified in the command line, but a
2349 	 * workload was, and in that case we will fill in the thread_map when
2350 	 * we fork the workload in perf_evlist__prepare_workload.
2352 	if (trace->filter_pids.nr > 0)
2353 		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2354 	else if (thread_map__pid(evlist->threads, 0) == -1)
2355 		err = trace__set_filter_loop_pids(trace);
2360 	if (trace->ev_qualifier_ids.nr > 0) {
2361 		err = trace__set_ev_qualifier_filter(trace);
2365 		pr_debug("event qualifier tracepoint filter: %s\n",
2366 			 trace->syscalls.events.sys_exit->filter);
2369 	err = perf_evlist__apply_filters(evlist, &evsel);
2371 		goto out_error_apply_filters;
2373 	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2375 		goto out_error_mmap;
/* With --delay, enabling happens after the workload is started below. */
2377 	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
2378 		perf_evlist__enable(evlist);
2381 		perf_evlist__start_workload(evlist);
2383 	if (trace->opts.initial_delay) {
2384 		usleep(trace->opts.initial_delay * 1000);
2385 		perf_evlist__enable(evlist);
2388 	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2389 				  evlist->threads->nr > 1 ||
2390 				  perf_evlist__first(evlist)->attr.inherit;
/* Main poll/dispatch loop over all per-cpu ring buffers. */
2392 	before = trace->nr_events;
2394 	for (i = 0; i < evlist->nr_mmaps; i++) {
2395 		union perf_event *event;
2397 		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2398 			struct perf_sample sample;
2402 			err = perf_evlist__parse_sample(evlist, event, &sample);
2404 				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2408 			trace__handle_event(trace, event, &sample);
2410 			perf_evlist__mmap_consume(evlist, i);
/* On exit request, disable events first, then drain remaining records. */
2415 			if (done && !draining) {
2416 				perf_evlist__disable(evlist);
2422 	if (trace->nr_events == before) {
2423 		int timeout = done ? 100 : -1;
2425 		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2426 			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2436 	thread__zput(trace->current);
2438 	perf_evlist__disable(evlist);
2442 			trace__fprintf_thread_summary(trace, trace->output);
2444 		if (trace->show_tool_stats) {
2445 			fprintf(trace->output, "Stats:\n "
2446 					       " vfs_getname : %" PRIu64 "\n"
2447 					       " proc_getname: %" PRIu64 "\n",
2448 				trace->stats.vfs_getname,
2449 				trace->stats.proc_getname);
/* Common cleanup and error labels follow. */
2454 	perf_evlist__delete(evlist);
2455 	trace->evlist = NULL;
2456 	trace->live = false;
2459 	char errbuf[BUFSIZ];
2461 out_error_sched_stat_runtime:
2462 	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2465 out_error_raw_syscalls:
2466 	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2470 	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2474 	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2477 	fprintf(trace->output, "%s\n", errbuf);
2478 	goto out_delete_evlist;
2480 out_error_apply_filters:
2481 	fprintf(trace->output,
2482 		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
2483 		evsel->filter, perf_evsel__name(evsel), errno,
2484 		str_error_r(errno, errbuf, sizeof(errbuf)));
2485 	goto out_delete_evlist;
2488 	fprintf(trace->output, "Not enough memory to run!\n");
2489 	goto out_delete_evlist;
2492 	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2493 	goto out_delete_evlist;
/*
 * Replay mode: process a previously recorded perf.data file, wiring up
 * the same syscall/page-fault/vfs_getname handlers used live (accepting
 * the older "syscalls:*" tracepoint names), then run the session.
 * (Some error checks between visible lines are elided.)
 */
2496 static int trace__replay(struct trace *trace)
2498 	const struct perf_evsel_str_handler handlers[] = {
2499 		{ "probe:vfs_getname",	     trace__vfs_getname, },
2501 	struct perf_data_file file = {
2503 		.mode  = PERF_DATA_MODE_READ,
2504 		.force = trace->force,
2506 	struct perf_session *session;
2507 	struct perf_evsel *evsel;
2510 	trace->tool.sample	  = trace__process_sample;
2511 	trace->tool.mmap	  = perf_event__process_mmap;
2512 	trace->tool.mmap2	  = perf_event__process_mmap2;
2513 	trace->tool.comm	  = perf_event__process_comm;
2514 	trace->tool.exit	  = perf_event__process_exit;
2515 	trace->tool.fork	  = perf_event__process_fork;
2516 	trace->tool.attr	  = perf_event__process_attr;
2517 	trace->tool.tracing_data  = perf_event__process_tracing_data;
2518 	trace->tool.build_id	  = perf_event__process_build_id;
2519 	trace->tool.namespaces	  = perf_event__process_namespaces;
2521 	trace->tool.ordered_events = true;
2522 	trace->tool.ordering_requires_timestamps = true;
2524 	/* add tid to output */
2525 	trace->multiple_threads = true;
2527 	session = perf_session__new(&file, false, &trace->tool);
2528 	if (session == NULL)
2531 	if (trace->opts.target.pid)
2532 		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
2534 	if (trace->opts.target.tid)
2535 		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
2537 	if (symbol__init(&session->header.env) < 0)
2540 	trace->host = &session->machines.host;
2542 	err = perf_session__set_tracepoints_handlers(session, handlers);
2546 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2547 						     "raw_syscalls:sys_enter");
2548 	/* older kernels have syscalls tp versus raw_syscalls */
2550 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2551 							     "syscalls:sys_enter");
2554 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2555 	     perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2556 		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2560 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2561 						     "raw_syscalls:sys_exit");
2563 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2564 							     "syscalls:sys_exit");
2566 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2567 	     perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2568 		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
/* Hook up the page-fault handler on any recorded SW fault events. */
2572 	evlist__for_each_entry(session->evlist, evsel) {
2573 		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2574 		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2575 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2576 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2577 			evsel->handler = trace__pgfault;
2582 	err = perf_session__process_events(session);
2584 		pr_err("Failed to process events, error %d", err);
2586 	else if (trace->summary)
2587 		trace__fprintf_thread_summary(trace, trace->output);
2590 	perf_session__delete(session);
/* Print the banner preceding the end-of-run per-thread summary.
 * Returns the number of bytes written (fprintf() result). */
2595 static size_t trace__fprintf_threads_header(FILE *fp)
2599 printed = fprintf(fp, "\n Summary of events:\n\n");
/*
 * Re-sort helper (rb_resort.h): builds an rb-tree of per-syscall stats
 * ordered by total elapsed time, descending (a->msecs > b->msecs).
 * Source nodes are int_node entries from a thread's syscall_stats
 * intlist, where ->i is the syscall id and ->priv the struct stats.
 */
2604 DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
2605 struct stats *stats;
2610 struct int_node *source = rb_entry(nd, struct int_node, rb_node);
2611 struct stats *stats = source->priv;
2613 entry->syscall = source->i;
2614 entry->stats = stats;
/* Total time in ms: call count * mean duration (ns) / NSEC_PER_MSEC;
 * 0 when the node has no stats attached. */
2615 entry->msecs = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
/*
 * Print one thread's per-syscall statistics table (calls, total, min,
 * avg, max, stddev), sorted by total time descending via the
 * syscall_stats resort tree above. Returns bytes written; elided lines
 * presumably bail out early when the resort allocation fails.
 */
2618 static size_t thread__dump_stats(struct thread_trace *ttrace,
2619 struct trace *trace, FILE *fp)
2624 DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2626 if (syscall_stats == NULL)
2629 printed += fprintf(fp, "\n");
2631 printed += fprintf(fp, " syscall calls total min avg max stddev\n");
2632 printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
2633 printed += fprintf(fp, " --------------- -------- --------- --------- --------- --------- ------\n");
2635 resort_rb__for_each_entry(nd, syscall_stats) {
2636 struct stats *stats = syscall_stats_entry->stats;
/* stats store nanoseconds; convert to milliseconds for display. */
2638 double min = (double)(stats->min) / NSEC_PER_MSEC;
2639 double max = (double)(stats->max) / NSEC_PER_MSEC;
2640 double avg = avg_stats(stats);
2642 u64 n = (u64) stats->n;
/* Relative stddev in percent; guard against a zero mean. */
2644 pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2645 avg /= NSEC_PER_MSEC;
/* Map the syscall id back to its name via the syscall table. */
2647 sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2648 printed += fprintf(fp, " %-15s", sc->name);
2649 printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2650 n, syscall_stats_entry->msecs, min, avg);
2651 printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2655 resort_rb__delete(syscall_stats);
2656 printed += fprintf(fp, "\n\n");
/*
 * Print one thread's summary line: comm, tid, event count, share of
 * all events, major/minor fault counts, accumulated runtime, then its
 * per-syscall table via thread__dump_stats(). Returns bytes written.
 * NOTE(review): the guards around the fault/runtime lines are elided;
 * presumably they print only when the respective counters are nonzero.
 */
2661 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2664 struct thread_trace *ttrace = thread__priv(thread);
/* This thread's share of all events seen, in percent. */
2670 ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2672 printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2673 printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2674 printed += fprintf(fp, "%.1f%%", ratio);
2676 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2678 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2680 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2681 else if (fputc('\n', fp) != EOF)
2684 printed += thread__dump_stats(ttrace, trace, fp);
/* Events recorded for a thread; 0 when it has no thread_trace attached. */
2689 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2691 return ttrace ? ttrace->nr_events : 0;
/*
 * Re-sort helper: orders the machine's threads by event count.
 * The "<" comparison yields most-active-first in the resort iteration
 * order used by trace__fprintf_thread_summary() below.
 */
2694 DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
2695 struct thread *thread;
2698 entry->thread = rb_entry(nd, struct thread, rb_node);
/*
 * Print the end-of-run summary: header, then one entry per thread of
 * the traced host machine, sorted by event count via the threads
 * resort tree. Returns total bytes written.
 */
2701 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2703 DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
2704 size_t printed = trace__fprintf_threads_header(fp);
/* Sorting failure is reported but not fatal (elided return path). */
2707 if (threads == NULL) {
2708 fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2712 resort_rb__for_each_entry(nd, threads)
2713 printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2715 resort_rb__delete(threads);
/*
 * parse-options callback for --duration: only events lasting longer
 * than this many milliseconds are shown.
 * NOTE(review): atof() silently yields 0.0 on unparsable input —
 * strtod() with endptr checking would reject bad values; verify
 * whether that laxness is intentional before changing it.
 */
2720 static int trace__set_duration(const struct option *opt, const char *str,
2721 int unset __maybe_unused)
2723 struct trace *trace = opt->value;
2725 trace->duration_filter = atof(str);
/*
 * parse-options callback for --filter-pids: parse a CSV list of pids
 * into trace->filter_pids. Slot 0 is always our own pid so the tracer
 * does not trace itself; the user's pids follow.
 */
2729 static int trace__set_filter_pids(const struct option *opt, const char *str,
2730 int unset __maybe_unused)
2734 struct trace *trace = opt->value;
/*
2736 * FIXME: introduce a intarray class, plain parse csv and create a
2737 * { int nr, int entries[] } struct...
 */
2739 struct intlist *list = intlist__new(str);
/* +1 reserves the extra slot for getpid() below. */
2744 i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2745 trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2747 if (trace->filter_pids.entries == NULL)
2750 trace->filter_pids.entries[0] = getpid();
2752 for (i = 1; i < trace->filter_pids.nr; ++i)
2753 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
/* The intlist was only a parsing vehicle; entries are copied out. */
2755 intlist__delete(list);
/*
 * Open the -o/--output file for writing. A pre-existing non-empty file
 * is first rotated to "<filename>.old" rather than clobbered.
 * Returns 0 on success, -errno when fopen() fails.
 * NOTE(review): the rename() result is not checked here.
 */
2761 static int trace__open_output(struct trace *trace, const char *filename)
2765 if (!stat(filename, &st) && st.st_size) {
2766 char oldname[PATH_MAX];
2768 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2770 rename(filename, oldname);
2773 trace->output = fopen(filename, "w");
2775 return trace->output == NULL ? -errno : 0;
/*
 * parse-options callback for -F/--pf: accumulate TRACE_PFMAJ/PFMIN
 * bits into the int pointed to by opt->value according to the
 * "all"/"maj"/"min" argument. The rejection path for other strings is
 * elided from this extract.
 */
2778 static int parse_pagefaults(const struct option *opt, const char *str,
2779 int unset __maybe_unused)
2781 int *trace_pgfaults = opt->value;
2783 if (strcmp(str, "all") == 0)
2784 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2785 else if (strcmp(str, "maj") == 0)
2786 *trace_pgfaults |= TRACE_PFMAJ;
2787 else if (strcmp(str, "min") == 0)
2788 *trace_pgfaults |= TRACE_PFMIN;
/* Install the same sample handler on every evsel in the evlist. */
2795 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2797 struct perf_evsel *evsel;
2799 evlist__for_each_entry(evlist, evsel)
2800 evsel->handler = handler;
/*
2804 * XXX: Hackish, just splitting the combined -e+--event (syscalls
2805 * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
2806 * existing facilities unchanged (trace->ev_qualifier + parse_options()).
 *
2808 * It'd be better to introduce a parse_options() variant that would return a
2809 * list with the terms it didn't match to an event...
 */
/*
 * Split a comma-separated -e argument into two lists: lists[1] gets
 * names recognized as syscalls or strace group files (they become the
 * ev_qualifier strlist), lists[0] gets everything else, which is fed
 * verbatim to the stock parse_events_option(). Elided lines presumably
 * handle the '!' negation prefix and the per-token loop boundaries.
 */
2811 static int trace__parse_events_option(const struct option *opt, const char *str,
2812 int unset __maybe_unused)
2814 struct trace *trace = (struct trace *)opt->value;
2815 const char *s = str;
2816 char *sep = NULL, *lists[2] = { NULL, NULL, };
2817 int len = strlen(str) + 1, err = -1, list;
2818 char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2819 char group_name[PATH_MAX];
2821 if (strace_groups_dir == NULL)
/* A leading '!' (elided test) negates the qualifier set. */
2826 trace->not_ev_qualifier = true;
2830 if ((sep = strchr(s, ',')) != NULL)
/* A token is a "syscall" if the syscall table knows it... */
2834 if (syscalltbl__id(trace->sctbl, s) >= 0) {
/* ...or if a readable strace group file of that name exists. */
2837 path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2838 if (access(group_name, R_OK) == 0)
/* Append to an already-started list, or allocate it first time round;
 * len bounds the total since tokens come from the same input string. */
2843 sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2845 lists[list] = malloc(len);
2846 if (lists[list] == NULL)
2848 strcpy(lists[list], s);
/* Syscall names/groups become the qualifier strlist. */
2858 if (lists[1] != NULL) {
2859 struct strlist_config slist_config = {
2860 .dirname = strace_groups_dir,
2863 trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2864 if (trace->ev_qualifier == NULL) {
2865 fputs("Not enough memory to parse event qualifier", trace->output);
2869 if (trace__validate_ev_qualifier(trace))
/* The remainder goes through the standard --event parser unchanged. */
2876 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
2877 "event selector. use 'perf list' to list available events",
2878 parse_events_option);
2879 err = parse_events_option(&o, lists[0], 0);
2888 int cmd_trace(int argc, const char **argv)
2890 const char *trace_usage[] = {
2891 "perf trace [<options>] [<command>]",
2892 "perf trace [<options>] -- <command> [<options>]",
2893 "perf trace record [<options>] [<command>]",
2894 "perf trace record [<options>] -- <command> [<options>]",
2897 struct trace trace = {
2906 .user_freq = UINT_MAX,
2907 .user_interval = ULLONG_MAX,
2908 .no_buffering = true,
2909 .mmap_pages = UINT_MAX,
2910 .proc_map_timeout = 500,
2914 .trace_syscalls = true,
2915 .kernel_syscallchains = false,
2916 .max_stack = UINT_MAX,
2918 const char *output_name = NULL;
2919 const struct option trace_options[] = {
2920 OPT_CALLBACK('e', "event", &trace, "event",
2921 "event/syscall selector. use 'perf list' to list available events",
2922 trace__parse_events_option),
2923 OPT_BOOLEAN(0, "comm", &trace.show_comm,
2924 "show the thread COMM next to its id"),
2925 OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2926 OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
2927 trace__parse_events_option),
2928 OPT_STRING('o', "output", &output_name, "file", "output file name"),
2929 OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2930 OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2931 "trace events on existing process id"),
2932 OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2933 "trace events on existing thread id"),
2934 OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
2935 "pids to filter (by the kernel)", trace__set_filter_pids),
2936 OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2937 "system-wide collection from all CPUs"),
2938 OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2939 "list of cpus to monitor"),
2940 OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2941 "child tasks do not inherit counters"),
2942 OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2943 "number of mmap data pages",
2944 perf_evlist__parse_mmap_pages),
2945 OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2947 OPT_CALLBACK(0, "duration", &trace, "float",
2948 "show only events with duration > N.M ms",
2949 trace__set_duration),
2950 OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2951 OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2952 OPT_BOOLEAN('T', "time", &trace.full_time,
2953 "Show full timestamp, not time relative to first start"),
2954 OPT_BOOLEAN('s', "summary", &trace.summary_only,
2955 "Show only syscall summary with statistics"),
2956 OPT_BOOLEAN('S', "with-summary", &trace.summary,
2957 "Show all syscalls and summary with statistics"),
2958 OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2959 "Trace pagefaults", parse_pagefaults, "maj"),
2960 OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2961 OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
2962 OPT_CALLBACK(0, "call-graph", &trace.opts,
2963 "record_mode[,record_size]", record_callchain_help,
2964 &record_parse_callchain_opt),
2965 OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
2966 "Show the kernel callchains on the syscall exit path"),
2967 OPT_UINTEGER(0, "min-stack", &trace.min_stack,
2968 "Set the minimum stack depth when parsing the callchain, "
2969 "anything below the specified depth will be ignored."),
2970 OPT_UINTEGER(0, "max-stack", &trace.max_stack,
2971 "Set the maximum stack depth when parsing the callchain, "
2972 "anything beyond the specified depth will be ignored. "
2973 "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
2974 OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
2975 "per thread proc mmap processing timeout in ms"),
2976 OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
2977 "ms to wait before starting measurement after program "
2981 bool __maybe_unused max_stack_user_set = true;
2982 bool mmap_pages_user_set = true;
2983 const char * const trace_subcommands[] = { "record", NULL };
2987 signal(SIGSEGV, sighandler_dump_stack);
2988 signal(SIGFPE, sighandler_dump_stack);
2990 trace.evlist = perf_evlist__new();
2991 trace.sctbl = syscalltbl__new();
2993 if (trace.evlist == NULL || trace.sctbl == NULL) {
2994 pr_err("Not enough memory to run!\n");
2999 argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3000 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3002 err = bpf__setup_stdout(trace.evlist);
3004 bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
3005 pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
3011 if (trace.trace_pgfaults) {
3012 trace.opts.sample_address = true;
3013 trace.opts.sample_time = true;
3016 if (trace.opts.mmap_pages == UINT_MAX)
3017 mmap_pages_user_set = false;
3019 if (trace.max_stack == UINT_MAX) {
3020 trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
3021 max_stack_user_set = false;
3024 #ifdef HAVE_DWARF_UNWIND_SUPPORT
3025 if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
3026 record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
3029 if (callchain_param.enabled) {
3030 if (!mmap_pages_user_set && geteuid() == 0)
3031 trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
3033 symbol_conf.use_callchain = true;
3036 if (trace.evlist->nr_entries > 0)
3037 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3039 if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3040 return trace__record(&trace, argc-1, &argv[1]);
3042 /* summary_only implies summary option, but don't overwrite summary if set */
3043 if (trace.summary_only)
3044 trace.summary = trace.summary_only;
3046 if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3047 trace.evlist->nr_entries == 0 /* Was --events used? */) {
3048 pr_err("Please specify something to trace.\n");
3052 if (!trace.trace_syscalls && trace.ev_qualifier) {
3053 pr_err("The -e option can't be used with --no-syscalls.\n");
3057 if (output_name != NULL) {
3058 err = trace__open_output(&trace, output_name);
3060 perror("failed to create output file");
3065 trace.open_id = syscalltbl__id(trace.sctbl, "open");
3067 err = target__validate(&trace.opts.target);
3069 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3070 fprintf(trace.output, "%s", bf);
3074 err = target__parse_uid(&trace.opts.target);
3076 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3077 fprintf(trace.output, "%s", bf);
3081 if (!argc && target__none(&trace.opts.target))
3082 trace.opts.target.system_wide = true;
3085 err = trace__replay(&trace);
3087 err = trace__run(&trace, argc, argv);
3090 if (output_name != NULL)
3091 fclose(trace.output);