perf trace: Associate vfs_getname()'ed pathname with fd returned from 'openat'
[linux-2.6-microblaze.git] / tools / perf / builtin-trace.c
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/cgroup.h"
23 #include "util/color.h"
24 #include "util/debug.h"
25 #include "util/env.h"
26 #include "util/event.h"
27 #include "util/evlist.h"
28 #include <subcmd/exec-cmd.h>
29 #include "util/machine.h"
30 #include "util/path.h"
31 #include "util/session.h"
32 #include "util/thread.h"
33 #include <subcmd/parse-options.h>
34 #include "util/strlist.h"
35 #include "util/intlist.h"
36 #include "util/thread_map.h"
37 #include "util/stat.h"
38 #include "trace/beauty/beauty.h"
39 #include "trace-event.h"
40 #include "util/parse-events.h"
41 #include "util/bpf-loader.h"
42 #include "callchain.h"
43 #include "print_binary.h"
44 #include "string2.h"
45 #include "syscalltbl.h"
46 #include "rb_resort.h"
47
48 #include <errno.h>
49 #include <inttypes.h>
50 #include <poll.h>
51 #include <signal.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #include <linux/err.h>
55 #include <linux/filter.h>
56 #include <linux/kernel.h>
57 #include <linux/random.h>
58 #include <linux/stringify.h>
59 #include <linux/time64.h>
60 #include <fcntl.h>
61
62 #include "sane_ctype.h"
63
/*
 * Fallback definitions for old system headers that lack them; the values
 * match the common Linux ABI ones (O_CLOEXEC here is the x86 octal value).
 */
#ifndef O_CLOEXEC
# define O_CLOEXEC              02000000
#endif

#ifndef F_LINUX_SPECIFIC_BASE
# define F_LINUX_SPECIFIC_BASE  1024
#endif
71
/*
 * Global state for one 'perf trace' session: the per-syscall table and the
 * raw_syscalls:sys_{enter,exit} evsels, event/pid filters, output options
 * and per-run tool statistics.
 */
struct trace {
        struct perf_tool        tool;
        struct syscalltbl       *sctbl;
        struct {
                int             max;            /* highest syscall id in 'table' */
                struct syscall  *table;         /* presumably indexed by syscall id — confirm at fill site */
                struct {
                        struct perf_evsel *sys_enter,
                                          *sys_exit;
                }               events;
        } syscalls;
        struct record_opts      opts;
        struct perf_evlist      *evlist;
        struct machine          *host;
        struct thread           *current;
        struct cgroup           *cgroup;
        u64                     base_time;
        FILE                    *output;        /* destination for the trace lines */
        unsigned long           nr_events;
        struct strlist          *ev_qualifier;  /* syscall names to select (or reject, see not_ev_qualifier) */
        struct {
                size_t          nr;
                int             *entries;       /* ev_qualifier resolved to syscall ids */
        }                       ev_qualifier_ids;
        struct {
                size_t          nr;
                pid_t           *entries;       /* pids to exclude from the trace */
        }                       filter_pids;
        double                  duration_filter;
        double                  runtime_ms;
        struct {
                u64             vfs_getname,    /* tool-internal counters, shown with show_tool_stats */
                                proc_getname;
        } stats;
        unsigned int            max_stack;
        unsigned int            min_stack;
        bool                    not_ev_qualifier;       /* invert the ev_qualifier selection */
        bool                    live;
        bool                    full_time;
        bool                    sched;
        bool                    multiple_threads;
        bool                    summary;
        bool                    summary_only;
        bool                    failure_only;
        bool                    show_comm;
        bool                    print_sample;
        bool                    show_tool_stats;
        bool                    trace_syscalls;
        bool                    kernel_syscallchains;
        bool                    force;
        bool                    vfs_getname;    /* is a vfs_getname probe event in use? */
        int                     trace_pgfaults;
};
125
/*
 * Accessor for one field of a tracepoint sample: the byte offset into the
 * raw payload plus a reader that returns the field either as an integer or
 * as a pointer into the payload (only one of the two is ever set).
 */
struct tp_field {
        int offset;
        union {
                u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
                void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
        };
};
133
/*
 * Generate tp_field__u{8,16,32,64}(): read a host-endian unsigned integer
 * of the given width from the sample payload.  memcpy() is used because
 * raw_data + offset may not be suitably aligned for a direct load.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
        u##bits value; \
        memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
        return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);
146
/*
 * Same as TP_UINT_FIELD(), but byte-swapping the value — used when the
 * perf.data file was recorded on a host of the opposite endianness.
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
        u##bits value; \
        memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
        return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
158
159 static int tp_field__init_uint(struct tp_field *field,
160                                struct format_field *format_field,
161                                bool needs_swap)
162 {
163         field->offset = format_field->offset;
164
165         switch (format_field->size) {
166         case 1:
167                 field->integer = tp_field__u8;
168                 break;
169         case 2:
170                 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
171                 break;
172         case 4:
173                 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
174                 break;
175         case 8:
176                 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
177                 break;
178         default:
179                 return -1;
180         }
181
182         return 0;
183 }
184
185 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
186 {
187         return sample->raw_data + field->offset;
188 }
189
190 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
191 {
192         field->offset = format_field->offset;
193         field->pointer = tp_field__ptr;
194         return 0;
195 }
196
/*
 * Fields common to the raw_syscalls tracepoints: the syscall id, plus
 * either the entry arguments (sys_enter) or the return value (sys_exit) —
 * a given evsel only ever uses one of the two, hence the union.
 */
struct syscall_tp {
        struct tp_field id;
        union {
                struct tp_field args, ret;
        };
};
203
204 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
205                                           struct tp_field *field,
206                                           const char *name)
207 {
208         struct format_field *format_field = perf_evsel__field(evsel, name);
209
210         if (format_field == NULL)
211                 return -1;
212
213         return tp_field__init_uint(field, format_field, evsel->needs_swap);
214 }
215
/*
 * Initialize the 'name' member of the evsel's struct syscall_tp (stashed
 * in evsel->priv) from the tracepoint field of the same name, as an
 * integer accessor.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
        ({ struct syscall_tp *sc = evsel->priv;\
           perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
219
/*
 * Like perf_evsel__init_tp_uint_field(), but binding a pointer accessor
 * for fields read directly out of the raw payload.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
                                         struct tp_field *field,
                                         const char *name)
{
        struct format_field *fmt = perf_evsel__field(evsel, name);

        return fmt ? tp_field__init_ptr(field, fmt) : -1;
}
231
/* As perf_evsel__init_sc_tp_uint_field(), but for pointer accessors. */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
        ({ struct syscall_tp *sc = evsel->priv;\
           perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
235
/*
 * Free the evsel's ->priv (our struct syscall_tp) before deleting the
 * evsel itself — perf_evsel__delete() knows nothing about our private data.
 */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
        zfree(&evsel->priv);
        perf_evsel__delete(evsel);
}
241
242 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
243 {
244         evsel->priv = malloc(sizeof(struct syscall_tp));
245         if (evsel->priv != NULL) {
246                 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
247                         goto out_delete;
248
249                 evsel->handler = handler;
250                 return 0;
251         }
252
253         return -ENOMEM;
254
255 out_delete:
256         zfree(&evsel->priv);
257         return -ENOENT;
258 }
259
260 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
261 {
262         struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
263
264         /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
265         if (IS_ERR(evsel))
266                 evsel = perf_evsel__newtp("syscalls", direction);
267
268         if (IS_ERR(evsel))
269                 return NULL;
270
271         if (perf_evsel__init_syscall_tp(evsel, handler))
272                 goto out_delete;
273
274         return evsel;
275
276 out_delete:
277         perf_evsel__delete_priv(evsel);
278         return NULL;
279 }
280
/*
 * Read tracepoint field 'name' from 'sample' as an integer, through the
 * accessors wired up in perf_evsel__init_syscall_tp().
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
        ({ struct syscall_tp *fields = evsel->priv; \
           fields->name.integer(&fields->name, sample); })

/* Likewise, but returning a pointer into the sample's raw payload. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
        ({ struct syscall_tp *fields = evsel->priv; \
           fields->name.pointer(&fields->name, sample); })
288
289 size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
290 {
291         int idx = val - sa->offset;
292
293         if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL)
294                 return scnprintf(bf, size, intfmt, val);
295
296         return scnprintf(bf, size, "%s", sa->entries[idx]);
297 }
298
299 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
300                                                 const char *intfmt,
301                                                 struct syscall_arg *arg)
302 {
303         return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
304 }
305
/* Format a syscall argument via its strarray, defaulting to decimal. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
                                              struct syscall_arg *arg)
{
        return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
313
/* A set of strarrays tried in order by syscall_arg__scnprintf_strarrays(). */
struct strarrays {
        int             nr_entries;
        struct strarray **entries;
};

/* Instantiate a struct strarrays named strarrays__<array> over 'array'. */
#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
        .nr_entries = ARRAY_SIZE(array), \
        .entries = array, \
}
323
324 size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
325                                         struct syscall_arg *arg)
326 {
327         struct strarrays *sas = arg->parm;
328         int i;
329
330         for (i = 0; i < sas->nr_entries; ++i) {
331                 struct strarray *sa = sas->entries[i];
332                 int idx = arg->val - sa->offset;
333
334                 if (idx >= 0 && idx < sa->nr_entries) {
335                         if (sa->entries[idx] == NULL)
336                                 break;
337                         return scnprintf(bf, size, "%s", sa->entries[idx]);
338                 }
339         }
340
341         return scnprintf(bf, size, "%d", arg->val);
342 }
343
344 #ifndef AT_FDCWD
345 #define AT_FDCWD        -100
346 #endif
347
348 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
349                                            struct syscall_arg *arg)
350 {
351         int fd = arg->val;
352
353         if (fd == AT_FDCWD)
354                 return scnprintf(bf, size, "CWD");
355
356         return syscall_arg__scnprintf_fd(bf, size, arg);
357 }
358
359 #define SCA_FDAT syscall_arg__scnprintf_fd_at
360
/*
 * Defined further down; forward-declared here so the syscall_fmts table
 * below can reference it for close(2)'s fd argument.
 */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
                                              struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
365
/* Print an argument as a 0x-prefixed hex value (addresses, opaque args). */
size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
        return scnprintf(bf, size, "%#lx", arg->val);
}

/* Print an argument as a signed decimal int. */
size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
{
        return scnprintf(bf, size, "%d", arg->val);
}

/* Print an argument as a signed decimal long. */
size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
        return scnprintf(bf, size, "%ld", arg->val);
}
380
/*
 * Symbolic-name tables consumed through SCA_STRARRAY; DEFINE_STRARRAY*()
 * wraps each in a struct strarray, optionally with the numeric value of
 * the first entry when it isn't zero.
 */
static const char *bpf_cmd[] = {
        "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
        "MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* epoll_ctl(2) ops start at 1 (EPOLL_CTL_ADD), hence the offset. */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

static const char *keyctl_options[] = {
        "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
        "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
        "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
        "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
        "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

/* SEEK_DATA/SEEK_HOLE only exist on newer systems, hence the #ifdefs. */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

static const char *fcntl_cmds[] = {
        "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
        "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
        "SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
        "GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/*
 * Linux-private fcntl commands, numbered from F_LINUX_SPECIFIC_BASE; the
 * [5] designator skips the unused base+3/base+4 slots before F_CANCELLK.
 */
static const char *fcntl_linux_specific_cmds[] = {
        "SETLEASE", "GETLEASE", "NOTIFY", [5] = "CANCELLK", "DUPFD_CLOEXEC",
        "SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
        "GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);

/* Both fcntl ranges, tried in order by SCA_FCNTL_CMD. */
static struct strarray *fcntl_cmds_arrays[] = {
        &strarray__fcntl_cmds,
        &strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);

static const char *rlimit_resources[] = {
        "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
        "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
        "RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

static const char *clockid[] = {
        "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
        "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
        "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

static const char *socket_families[] = {
        "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
        "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
        "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
        "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
        "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
        "ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
461
462 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
463                                                  struct syscall_arg *arg)
464 {
465         size_t printed = 0;
466         int mode = arg->val;
467
468         if (mode == F_OK) /* 0 */
469                 return scnprintf(bf, size, "F");
470 #define P_MODE(n) \
471         if (mode & n##_OK) { \
472                 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
473                 mode &= ~n##_OK; \
474         }
475
476         P_MODE(R);
477         P_MODE(W);
478         P_MODE(X);
479 #undef P_MODE
480
481         if (mode)
482                 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
483
484         return printed;
485 }
486
487 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
488
/*
 * Defined further down (needs the vfs_getname machinery); forward-declared
 * so the syscall_fmts table can reference it for pathname arguments.
 */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
                                              struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename
493
494 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
495                                                 struct syscall_arg *arg)
496 {
497         int printed = 0, flags = arg->val;
498
499 #define P_FLAG(n) \
500         if (flags & O_##n) { \
501                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
502                 flags &= ~O_##n; \
503         }
504
505         P_FLAG(CLOEXEC);
506         P_FLAG(NONBLOCK);
507 #undef P_FLAG
508
509         if (flags)
510                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
511
512         return printed;
513 }
514
515 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
516
/* getrandom(2) flags; fallback definitions for old system headers. */
#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK   0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM     0x0002
#endif
523
524 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
525                                                    struct syscall_arg *arg)
526 {
527         int printed = 0, flags = arg->val;
528
529 #define P_FLAG(n) \
530         if (flags & GRND_##n) { \
531                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
532                 flags &= ~GRND_##n; \
533         }
534
535         P_FLAG(RANDOM);
536         P_FLAG(NONBLOCK);
537 #undef P_FLAG
538
539         if (flags)
540                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
541
542         return printed;
543 }
544
545 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
546
/*
 * Shorthand for a syscall_arg_fmt printing via SCA_STRARRAY over the given
 * strarray; 'name' only serves as self-documentation at the use site.
 */
#define STRARRAY(name, array) \
          { .scnprintf  = SCA_STRARRAY, \
            .parm       = &strarray__##array, }
550
551 #include "trace/beauty/arch_errno_names.c"
552 #include "trace/beauty/eventfd.c"
553 #include "trace/beauty/futex_op.c"
554 #include "trace/beauty/futex_val3.c"
555 #include "trace/beauty/mmap.c"
556 #include "trace/beauty/mode_t.c"
557 #include "trace/beauty/msg_flags.c"
558 #include "trace/beauty/open_flags.c"
559 #include "trace/beauty/perf_event_open.c"
560 #include "trace/beauty/pid.c"
561 #include "trace/beauty/sched_policy.c"
562 #include "trace/beauty/seccomp.c"
563 #include "trace/beauty/signum.c"
564 #include "trace/beauty/socket_type.c"
565 #include "trace/beauty/waitid_options.c"
566
/*
 * How to pretty-print one syscall argument: the formatter callback, its
 * opaque parameter, the argument name, and whether a zero value should
 * still be shown.
 */
struct syscall_arg_fmt {
        size_t     (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
        void       *parm;
        const char *name;
        bool       show_zero;
};
573
574 static struct syscall_fmt {
575         const char *name;
576         const char *alias;
577         struct syscall_arg_fmt arg[6];
578         u8         nr_args;
579         bool       errpid;
580         bool       timeout;
581         bool       hexret;
582 } syscall_fmts[] = {
583         { .name     = "access",
584           .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
585         { .name     = "bpf",
586           .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
587         { .name     = "brk",        .hexret = true,
588           .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
589         { .name     = "clock_gettime",
590           .arg = { [0] = STRARRAY(clk_id, clockid), }, },
591         { .name     = "clone",      .errpid = true, .nr_args = 5,
592           .arg = { [0] = { .name = "flags",         .scnprintf = SCA_CLONE_FLAGS, },
593                    [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
594                    [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
595                    [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
596                    [4] = { .name = "tls",           .scnprintf = SCA_HEX, }, }, },
597         { .name     = "close",
598           .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
599         { .name     = "epoll_ctl",
600           .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
601         { .name     = "eventfd2",
602           .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
603         { .name     = "fchmodat",
604           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
605         { .name     = "fchownat",
606           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
607         { .name     = "fcntl",
608           .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
609                            .parm      = &strarrays__fcntl_cmds_arrays,
610                            .show_zero = true, },
611                    [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
612         { .name     = "flock",
613           .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
614         { .name     = "fstat", .alias = "newfstat", },
615         { .name     = "fstatat", .alias = "newfstatat", },
616         { .name     = "futex",
617           .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
618                    [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
619         { .name     = "futimesat",
620           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
621         { .name     = "getitimer",
622           .arg = { [0] = STRARRAY(which, itimers), }, },
623         { .name     = "getpid",     .errpid = true, },
624         { .name     = "getpgid",    .errpid = true, },
625         { .name     = "getppid",    .errpid = true, },
626         { .name     = "getrandom",
627           .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
628         { .name     = "getrlimit",
629           .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
630         { .name     = "gettid",     .errpid = true, },
631         { .name     = "ioctl",
632           .arg = {
633 #if defined(__i386__) || defined(__x86_64__)
634 /*
635  * FIXME: Make this available to all arches.
636  */
637                    [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
638                    [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
639 #else
640                    [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
641 #endif
642         { .name     = "kcmp",       .nr_args = 5,
643           .arg = { [0] = { .name = "pid1",      .scnprintf = SCA_PID, },
644                    [1] = { .name = "pid2",      .scnprintf = SCA_PID, },
645                    [2] = { .name = "type",      .scnprintf = SCA_KCMP_TYPE, },
646                    [3] = { .name = "idx1",      .scnprintf = SCA_KCMP_IDX, },
647                    [4] = { .name = "idx2",      .scnprintf = SCA_KCMP_IDX, }, }, },
648         { .name     = "keyctl",
649           .arg = { [0] = STRARRAY(option, keyctl_options), }, },
650         { .name     = "kill",
651           .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
652         { .name     = "linkat",
653           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
654         { .name     = "lseek",
655           .arg = { [2] = STRARRAY(whence, whences), }, },
656         { .name     = "lstat", .alias = "newlstat", },
657         { .name     = "madvise",
658           .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
659                    [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
660         { .name     = "mkdirat",
661           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
662         { .name     = "mknodat",
663           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
664         { .name     = "mlock",
665           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
666         { .name     = "mlockall",
667           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
668         { .name     = "mmap",       .hexret = true,
669 /* The standard mmap maps to old_mmap on s390x */
670 #if defined(__s390x__)
671         .alias = "old_mmap",
672 #endif
673           .arg = { [0] = { .scnprintf = SCA_HEX,        /* addr */ },
674                    [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ },
675                    [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ }, }, },
676         { .name     = "mprotect",
677           .arg = { [0] = { .scnprintf = SCA_HEX,        /* start */ },
678                    [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ }, }, },
679         { .name     = "mq_unlink",
680           .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
681         { .name     = "mremap",     .hexret = true,
682           .arg = { [0] = { .scnprintf = SCA_HEX,          /* addr */ },
683                    [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
684                    [4] = { .scnprintf = SCA_HEX,          /* new_addr */ }, }, },
685         { .name     = "munlock",
686           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
687         { .name     = "munmap",
688           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
689         { .name     = "name_to_handle_at",
690           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
691         { .name     = "newfstatat",
692           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
693         { .name     = "open",
694           .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
695         { .name     = "open_by_handle_at",
696           .arg = { [0] = { .scnprintf = SCA_FDAT,       /* dfd */ },
697                    [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
698         { .name     = "openat",
699           .arg = { [0] = { .scnprintf = SCA_FDAT,       /* dfd */ },
700                    [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
701         { .name     = "perf_event_open",
702           .arg = { [2] = { .scnprintf = SCA_INT,        /* cpu */ },
703                    [3] = { .scnprintf = SCA_FD,         /* group_fd */ },
704                    [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
705         { .name     = "pipe2",
706           .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
707         { .name     = "pkey_alloc",
708           .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,   /* access_rights */ }, }, },
709         { .name     = "pkey_free",
710           .arg = { [0] = { .scnprintf = SCA_INT,        /* key */ }, }, },
711         { .name     = "pkey_mprotect",
712           .arg = { [0] = { .scnprintf = SCA_HEX,        /* start */ },
713                    [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ },
714                    [3] = { .scnprintf = SCA_INT,        /* pkey */ }, }, },
715         { .name     = "poll", .timeout = true, },
716         { .name     = "ppoll", .timeout = true, },
717         { .name     = "prctl", .alias = "arch_prctl",
718           .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
719                    [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
720                    [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
721         { .name     = "pread", .alias = "pread64", },
722         { .name     = "preadv", .alias = "pread", },
723         { .name     = "prlimit64",
724           .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
725         { .name     = "pwrite", .alias = "pwrite64", },
726         { .name     = "readlinkat",
727           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
728         { .name     = "recvfrom",
729           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
730         { .name     = "recvmmsg",
731           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
732         { .name     = "recvmsg",
733           .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
734         { .name     = "renameat",
735           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
736         { .name     = "rt_sigaction",
737           .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
738         { .name     = "rt_sigprocmask",
739           .arg = { [0] = STRARRAY(how, sighow), }, },
740         { .name     = "rt_sigqueueinfo",
741           .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
742         { .name     = "rt_tgsigqueueinfo",
743           .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
744         { .name     = "sched_setscheduler",
745           .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
746         { .name     = "seccomp",
747           .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,    /* op */ },
748                    [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
749         { .name     = "select", .timeout = true, },
750         { .name     = "sendmmsg",
751           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
752         { .name     = "sendmsg",
753           .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
754         { .name     = "sendto",
755           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
756         { .name     = "set_tid_address", .errpid = true, },
757         { .name     = "setitimer",
758           .arg = { [0] = STRARRAY(which, itimers), }, },
759         { .name     = "setrlimit",
760           .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
761         { .name     = "socket",
762           .arg = { [0] = STRARRAY(family, socket_families),
763                    [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
764                    [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
765         { .name     = "socketpair",
766           .arg = { [0] = STRARRAY(family, socket_families),
767                    [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
768                    [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
769         { .name     = "stat", .alias = "newstat", },
770         { .name     = "statx",
771           .arg = { [0] = { .scnprintf = SCA_FDAT,        /* fdat */ },
772                    [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
773                    [3] = { .scnprintf = SCA_STATX_MASK,  /* mask */ }, }, },
774         { .name     = "swapoff",
775           .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
776         { .name     = "swapon",
777           .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
778         { .name     = "symlinkat",
779           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
780         { .name     = "tgkill",
781           .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
782         { .name     = "tkill",
783           .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
784         { .name     = "uname", .alias = "newuname", },
785         { .name     = "unlinkat",
786           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
787         { .name     = "utimensat",
788           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
789         { .name     = "wait4",      .errpid = true,
790           .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
791         { .name     = "waitid",     .errpid = true,
792           .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
793 };
794
795 static int syscall_fmt__cmp(const void *name, const void *fmtp)
796 {
797         const struct syscall_fmt *fmt = fmtp;
798         return strcmp(name, fmt->name);
799 }
800
801 static struct syscall_fmt *syscall_fmt__find(const char *name)
802 {
803         const int nmemb = ARRAY_SIZE(syscall_fmts);
804         return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
805 }
806
807 /*
808  * is_exit: is this "exit" or "exit_group"?
809  * is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter.
810  */
811 struct syscall {
812         struct event_format *tp_format;
813         int                 nr_args;
814         bool                is_exit;
815         bool                is_open;
816         struct format_field *args;
817         const char          *name;
818         struct syscall_fmt  *fmt;
819         struct syscall_arg_fmt *arg_fmt;
820 };
821
822 /*
823  * We need to have this 'calculated' boolean because in some cases we really
824  * don't know what is the duration of a syscall, for instance, when we start
825  * a session and some threads are waiting for a syscall to finish, say 'poll',
826  * in which case all we can do is to print "( ? ) for duration and for the
827  * start timestamp.
828  */
829 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
830 {
831         double duration = (double)t / NSEC_PER_MSEC;
832         size_t printed = fprintf(fp, "(");
833
834         if (!calculated)
835                 printed += fprintf(fp, "         ");
836         else if (duration >= 1.0)
837                 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
838         else if (duration >= 0.01)
839                 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
840         else
841                 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
842         return printed + fprintf(fp, "): ");
843 }
844
845 /**
846  * filename.ptr: The filename char pointer that will be vfs_getname'd
847  * filename.entry_str_pos: Where to insert the string translated from
848  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
849  * ret_scnprintf: syscall args may set this to a different syscall return
850  *                formatter, for instance, fcntl may return fds, file flags, etc.
851  */
852 struct thread_trace {
853         u64               entry_time;
854         bool              entry_pending;
855         unsigned long     nr_events;
856         unsigned long     pfmaj, pfmin;
857         char              *entry_str;
858         double            runtime_ms;
859         size_t            (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
860         struct {
861                 unsigned long ptr;
862                 short int     entry_str_pos;
863                 bool          pending_open;
864                 unsigned int  namelen;
865                 char          *name;
866         } filename;
867         struct {
868                 int       max;
869                 char      **table;
870         } paths;
871
872         struct intlist *syscall_stats;
873 };
874
875 static struct thread_trace *thread_trace__new(void)
876 {
877         struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
878
879         if (ttrace)
880                 ttrace->paths.max = -1;
881
882         ttrace->syscall_stats = intlist__new(NULL);
883
884         return ttrace;
885 }
886
887 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
888 {
889         struct thread_trace *ttrace;
890
891         if (thread == NULL)
892                 goto fail;
893
894         if (thread__priv(thread) == NULL)
895                 thread__set_priv(thread, thread_trace__new());
896
897         if (thread__priv(thread) == NULL)
898                 goto fail;
899
900         ttrace = thread__priv(thread);
901         ++ttrace->nr_events;
902
903         return ttrace;
904 fail:
905         color_fprintf(fp, PERF_COLOR_RED,
906                       "WARNING: not enough memory, dropping samples!\n");
907         return NULL;
908 }
909
910
911 void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
912                                     size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
913 {
914         struct thread_trace *ttrace = thread__priv(arg->thread);
915
916         ttrace->ret_scnprintf = ret_scnprintf;
917 }
918
/* Bit flags selecting major/minor page-fault tracing. */
#define TRACE_PFMAJ             (1 << 0)
#define TRACE_PFMIN             (1 << 1)

/* Size of the per-thread ttrace->entry_str staging buffer. */
static const size_t trace__entry_str_size = 2048;
923
924 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
925 {
926         struct thread_trace *ttrace = thread__priv(thread);
927
928         if (fd > ttrace->paths.max) {
929                 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
930
931                 if (npath == NULL)
932                         return -1;
933
934                 if (ttrace->paths.max != -1) {
935                         memset(npath + ttrace->paths.max + 1, 0,
936                                (fd - ttrace->paths.max) * sizeof(char *));
937                 } else {
938                         memset(npath, 0, (fd + 1) * sizeof(char *));
939                 }
940
941                 ttrace->paths.table = npath;
942                 ttrace->paths.max   = fd;
943         }
944
945         ttrace->paths.table[fd] = strdup(pathname);
946
947         return ttrace->paths.table[fd] != NULL ? 0 : -1;
948 }
949
950 static int thread__read_fd_path(struct thread *thread, int fd)
951 {
952         char linkname[PATH_MAX], pathname[PATH_MAX];
953         struct stat st;
954         int ret;
955
956         if (thread->pid_ == thread->tid) {
957                 scnprintf(linkname, sizeof(linkname),
958                           "/proc/%d/fd/%d", thread->pid_, fd);
959         } else {
960                 scnprintf(linkname, sizeof(linkname),
961                           "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
962         }
963
964         if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
965                 return -1;
966
967         ret = readlink(linkname, pathname, sizeof(pathname));
968
969         if (ret < 0 || ret > st.st_size)
970                 return -1;
971
972         pathname[ret] = '\0';
973         return trace__set_fd_pathname(thread, fd, pathname);
974 }
975
976 static const char *thread__fd_path(struct thread *thread, int fd,
977                                    struct trace *trace)
978 {
979         struct thread_trace *ttrace = thread__priv(thread);
980
981         if (ttrace == NULL)
982                 return NULL;
983
984         if (fd < 0)
985                 return NULL;
986
987         if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
988                 if (!trace->live)
989                         return NULL;
990                 ++trace->stats.proc_getname;
991                 if (thread__read_fd_path(thread, fd))
992                         return NULL;
993         }
994
995         return ttrace->paths.table[fd];
996 }
997
998 size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
999 {
1000         int fd = arg->val;
1001         size_t printed = scnprintf(bf, size, "%d", fd);
1002         const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1003
1004         if (path)
1005                 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1006
1007         return printed;
1008 }
1009
1010 size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1011 {
1012         size_t printed = scnprintf(bf, size, "%d", fd);
1013         struct thread *thread = machine__find_thread(trace->host, pid, pid);
1014
1015         if (thread) {
1016                 const char *path = thread__fd_path(thread, fd, trace);
1017
1018                 if (path)
1019                         printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1020
1021                 thread__put(thread);
1022         }
1023
1024         return printed;
1025 }
1026
1027 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1028                                               struct syscall_arg *arg)
1029 {
1030         int fd = arg->val;
1031         size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1032         struct thread_trace *ttrace = thread__priv(arg->thread);
1033
1034         if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1035                 zfree(&ttrace->paths.table[fd]);
1036
1037         return printed;
1038 }
1039
1040 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1041                                      unsigned long ptr)
1042 {
1043         struct thread_trace *ttrace = thread__priv(thread);
1044
1045         ttrace->filename.ptr = ptr;
1046         ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1047 }
1048
1049 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1050                                               struct syscall_arg *arg)
1051 {
1052         unsigned long ptr = arg->val;
1053
1054         if (!arg->trace->vfs_getname)
1055                 return scnprintf(bf, size, "%#x", ptr);
1056
1057         thread__set_filename_pos(arg->thread, bf, ptr);
1058         return 0;
1059 }
1060
1061 static bool trace__filter_duration(struct trace *trace, double t)
1062 {
1063         return t < (trace->duration_filter * NSEC_PER_MSEC);
1064 }
1065
1066 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1067 {
1068         double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1069
1070         return fprintf(fp, "%10.3f ", ts);
1071 }
1072
1073 /*
1074  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1075  * using ttrace->entry_time for a thread that receives a sys_exit without
1076  * first having received a sys_enter ("poll" issued before tracing session
1077  * starts, lost sys_enter exit due to ring buffer overflow).
1078  */
1079 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1080 {
1081         if (tstamp > 0)
1082                 return __trace__fprintf_tstamp(trace, tstamp, fp);
1083
1084         return fprintf(fp, "         ? ");
1085 }
1086
/* Set from sig_handler(); polled elsewhere to wind the tracing session up. */
static bool done = false;
/* Whether it was SIGINT that stopped us, as opposed to e.g. SIGCHLD. */
static bool interrupted = false;
1089
1090 static void sig_handler(int sig)
1091 {
1092         done = true;
1093         interrupted = sig == SIGINT;
1094 }
1095
1096 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1097                                         u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1098 {
1099         size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1100         printed += fprintf_duration(duration, duration_calculated, fp);
1101
1102         if (trace->multiple_threads) {
1103                 if (trace->show_comm)
1104                         printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1105                 printed += fprintf(fp, "%d ", thread->tid);
1106         }
1107
1108         return printed;
1109 }
1110
1111 static int trace__process_event(struct trace *trace, struct machine *machine,
1112                                 union perf_event *event, struct perf_sample *sample)
1113 {
1114         int ret = 0;
1115
1116         switch (event->header.type) {
1117         case PERF_RECORD_LOST:
1118                 color_fprintf(trace->output, PERF_COLOR_RED,
1119                               "LOST %" PRIu64 " events!\n", event->lost.lost);
1120                 ret = machine__process_lost_event(machine, event, sample);
1121                 break;
1122         default:
1123                 ret = machine__process_event(machine, event, sample);
1124                 break;
1125         }
1126
1127         return ret;
1128 }
1129
1130 static int trace__tool_process(struct perf_tool *tool,
1131                                union perf_event *event,
1132                                struct perf_sample *sample,
1133                                struct machine *machine)
1134 {
1135         struct trace *trace = container_of(tool, struct trace, tool);
1136         return trace__process_event(trace, machine, event, sample);
1137 }
1138
1139 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1140 {
1141         struct machine *machine = vmachine;
1142
1143         if (machine->kptr_restrict_warned)
1144                 return NULL;
1145
1146         if (symbol_conf.kptr_restrict) {
1147                 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1148                            "Check /proc/sys/kernel/kptr_restrict.\n\n"
1149                            "Kernel samples will not be resolved.\n");
1150                 machine->kptr_restrict_warned = true;
1151                 return NULL;
1152         }
1153
1154         return machine__resolve_kernel_addr(vmachine, addrp, modp);
1155 }
1156
1157 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1158 {
1159         int err = symbol__init(NULL);
1160
1161         if (err)
1162                 return err;
1163
1164         trace->host = machine__new_host();
1165         if (trace->host == NULL)
1166                 return -ENOMEM;
1167
1168         err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
1169         if (err < 0)
1170                 goto out;
1171
1172         err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1173                                             evlist->threads, trace__tool_process, false,
1174                                             trace->opts.proc_map_timeout, 1);
1175 out:
1176         if (err)
1177                 symbol__exit();
1178
1179         return err;
1180 }
1181
/* Undo trace__symbols_init(): drop the host machine, then the symbol state. */
static void trace__symbols__exit(struct trace *trace)
{
	machine__exit(trace->host);
	trace->host = NULL;

	symbol__exit();
}
1189
1190 static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1191 {
1192         int idx;
1193
1194         if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1195                 nr_args = sc->fmt->nr_args;
1196
1197         sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1198         if (sc->arg_fmt == NULL)
1199                 return -1;
1200
1201         for (idx = 0; idx < nr_args; ++idx) {
1202                 if (sc->fmt)
1203                         sc->arg_fmt[idx] = sc->fmt->arg[idx];
1204         }
1205
1206         sc->nr_args = nr_args;
1207         return 0;
1208 }
1209
1210 static int syscall__set_arg_fmts(struct syscall *sc)
1211 {
1212         struct format_field *field;
1213         int idx = 0, len;
1214
1215         for (field = sc->args; field; field = field->next, ++idx) {
1216                 if (sc->fmt && sc->fmt->arg[idx].scnprintf)
1217                         continue;
1218
1219                 if (strcmp(field->type, "const char *") == 0 &&
1220                          (strcmp(field->name, "filename") == 0 ||
1221                           strcmp(field->name, "path") == 0 ||
1222                           strcmp(field->name, "pathname") == 0))
1223                         sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
1224                 else if (field->flags & FIELD_IS_POINTER)
1225                         sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
1226                 else if (strcmp(field->type, "pid_t") == 0)
1227                         sc->arg_fmt[idx].scnprintf = SCA_PID;
1228                 else if (strcmp(field->type, "umode_t") == 0)
1229                         sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
1230                 else if ((strcmp(field->type, "int") == 0 ||
1231                           strcmp(field->type, "unsigned int") == 0 ||
1232                           strcmp(field->type, "long") == 0) &&
1233                          (len = strlen(field->name)) >= 2 &&
1234                          strcmp(field->name + len - 2, "fd") == 0) {
1235                         /*
1236                          * /sys/kernel/tracing/events/syscalls/sys_enter*
1237                          * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1238                          * 65 int
1239                          * 23 unsigned int
1240                          * 7 unsigned long
1241                          */
1242                         sc->arg_fmt[idx].scnprintf = SCA_FD;
1243                 }
1244         }
1245
1246         return 0;
1247 }
1248
/*
 * Fill in trace->syscalls.table[id]: resolve the syscall name, find its
 * syscall_fmts[] entry, read the sys_enter tracepoint format and set up
 * the per-argument formatters.  Returns 0 on success, -1 otherwise.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	/* id not in this arch's syscall table. */
	if (name == NULL)
		return -1;

	/*
	 * Grow the id -> struct syscall table on demand, zeroing the new
	 * slots so that ->name == NULL marks not-yet-read entries.
	 */
	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* Some syscalls are traced under an alias, e.g. "stat" -> "newstat". */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	/* No tracepoint format file: fall back to the generic 6 args. */
	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
		return -1;

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	/*
	 * Discard the leading '__syscall_nr' (or 'nr') field carrying the
	 * syscall number, it is redundant here.  Older kernels don't have
	 * it, hence the check rather than an unconditional skip.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
	sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat");

	return syscall__set_arg_fmts(sc);
}
1310
/*
 * Translate the -e/--expr qualifier names (exact names or globs like
 * "epoll*") into syscall ids in trace->ev_qualifier_ids.  All invalid
 * names are accumulated into a single error message.  Returns 0 or a
 * negative error, in which case the ids array is freed.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	size_t nr_allocated;
	struct str_node *pos;

	/* Initial guess: one id per qualifier entry; globs may add more below. */
	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		       trace->output);
		err = -EINVAL;
		goto out;
	}

	nr_allocated = trace->ev_qualifier_ids.nr;
	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;

		if (id < 0) {
			/* Not an exact name: try it as a glob. */
			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
			if (id >= 0)
				goto matches;

			/* Collect every invalid name into one "Invalid syscall" line. */
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}
matches:
		trace->ev_qualifier_ids.entries[i++] = id;
		if (match_next == -1)
			continue;

		/* Keep collecting ids matching the glob, growing the array as needed. */
		while (1) {
			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
			if (id < 0)
				break;
			if (nr_allocated == trace->ev_qualifier_ids.nr) {
				void *entries;

				nr_allocated += 8;
				entries = realloc(trace->ev_qualifier_ids.entries,
						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
				if (entries == NULL) {
					err = -ENOMEM;
					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
					goto out_free;
				}
				trace->ev_qualifier_ids.entries = entries;
			}
			trace->ev_qualifier_ids.nr++;
			trace->ev_qualifier_ids.entries[i++] = id;
		}
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
out_free:
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}
1386
1387 /*
1388  * args is to be interpreted as a series of longs but we need to handle
1389  * 8-byte unaligned accesses. args points to raw_data within the event
1390  * and raw_data is guaranteed to be 8-byte unaligned because it is
1391  * preceded by raw_size which is a u32. So we need to copy args to a temp
1392  * variable to read it. Most notably this avoids extended load instructions
1393  * on unaligned addresses
1394  */
1395 unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1396 {
1397         unsigned long val;
1398         unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1399
1400         memcpy(&val, p, sizeof(val));
1401         return val;
1402 }
1403
1404 static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1405                                       struct syscall_arg *arg)
1406 {
1407         if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1408                 return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1409
1410         return scnprintf(bf, size, "arg%d: ", arg->idx);
1411 }
1412
1413 static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1414                                      struct syscall_arg *arg, unsigned long val)
1415 {
1416         if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1417                 arg->val = val;
1418                 if (sc->arg_fmt[arg->idx].parm)
1419                         arg->parm = sc->arg_fmt[arg->idx].parm;
1420                 return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1421         }
1422         return scnprintf(bf, size, "%ld", val);
1423 }
1424
/*
 * Format all of a syscall's arguments into 'bf'.  Uses the tracepoint
 * field list when available; otherwise (tp format unreadable) prints the
 * raw values.  Returns the number of characters written.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned long val;
	u8 bit = 1;	/* tracks arg.idx as a bitmask, tested against arg.mask */
	struct syscall_arg arg = {
		.args   = args,
		.idx    = 0,
		.mask   = 0,
		.trace  = trace,
		.thread = thread,
	};
	struct thread_trace *ttrace = thread__priv(thread);

	/*
	 * Things like fcntl will set this in its 'cmd' formatter to pick the
	 * right formatter for the return value (an fd? file flags?), which is
	 * not needed for syscalls that always return a given type, say an fd.
	 */
	ttrace->ret_scnprintf = NULL;

	if (sc->args != NULL) {
		struct format_field *field;

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			/* Skip args already consumed by an earlier formatter. */
			if (arg.mask & bit)
				continue;

			val = syscall_arg__val(&arg, arg.idx);

			/*
			 * Suppress this argument if its value is zero and
			 * we don't have a string associated in an
			 * strarray for it.
			 */
			if (val == 0 &&
			    !(sc->arg_fmt &&
			      (sc->arg_fmt[arg.idx].show_zero ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
			      sc->arg_fmt[arg.idx].parm))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		while (arg.idx < sc->nr_args) {
			if (arg.mask & bit)
				goto next_arg;
			val = syscall_arg__val(&arg, arg.idx);
			if (printed)
				printed += scnprintf(bf + printed, size - printed, ", ");
			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
next_arg:
			++arg.idx;
			bit <<= 1;
		}
	}

	return printed;
}
1497
/* Signature shared by the per-tracepoint sample handlers (sys_enter et al). */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1501
/*
 * Return the struct syscall for 'id', lazily reading its tracepoint info
 * on first use.  NULL when the id is invalid or its info can't be read.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	/* name == NULL marks a slot trace__read_syscall_info() hasn't filled. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	/* Re-check: trace__read_syscall_info() may leave the slot unfilled. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
1544
1545 static void thread__update_stats(struct thread_trace *ttrace,
1546                                  int id, struct perf_sample *sample)
1547 {
1548         struct int_node *inode;
1549         struct stats *stats;
1550         u64 duration = 0;
1551
1552         inode = intlist__findnew(ttrace->syscall_stats, id);
1553         if (inode == NULL)
1554                 return;
1555
1556         stats = inode->priv;
1557         if (stats == NULL) {
1558                 stats = malloc(sizeof(struct stats));
1559                 if (stats == NULL)
1560                         return;
1561                 init_stats(stats);
1562                 inode->priv = stats;
1563         }
1564
1565         if (ttrace->entry_time && sample->time > ttrace->entry_time)
1566                 duration = sample->time - ttrace->entry_time;
1567
1568         update_stats(stats, duration);
1569 }
1570
1571 static int trace__printf_interrupted_entry(struct trace *trace)
1572 {
1573         struct thread_trace *ttrace;
1574         size_t printed;
1575
1576         if (trace->failure_only || trace->current == NULL)
1577                 return 0;
1578
1579         ttrace = thread__priv(trace->current);
1580
1581         if (!ttrace->entry_pending)
1582                 return 0;
1583
1584         printed  = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
1585         printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1586         ttrace->entry_pending = false;
1587
1588         return printed;
1589 }
1590
1591 static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
1592                                  struct perf_sample *sample, struct thread *thread)
1593 {
1594         int printed = 0;
1595
1596         if (trace->print_sample) {
1597                 double ts = (double)sample->time / NSEC_PER_MSEC;
1598
1599                 printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
1600                                    perf_evsel__name(evsel), ts,
1601                                    thread__comm_str(thread),
1602                                    sample->pid, sample->tid, sample->cpu);
1603         }
1604
1605         return printed;
1606 }
1607
/*
 * Handler for raw_syscalls:sys_enter: format "name(args" into the
 * per-thread entry_str buffer.  For most syscalls the line is kept pending
 * and completed (with the return value) in trace__sys_exit(); syscalls
 * flagged 'is_exit' (which have no matching sys_exit) are printed right
 * away.  Returns 0 on success, -1 on failure.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	/* Raw pointer into the tracepoint payload holding the syscall args. */
	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Lazily allocate the per-thread formatting buffer on first use. */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	/*
	 * Under filtering/summary modes we may end up not printing this
	 * entry at all, so don't flush a previously interrupted one either.
	 */
	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		/* No sys_exit will come: print the complete line immediately. */
		if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Track the most recently seen thread for interrupted-entry output. */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1668
1669 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1670                                     struct perf_sample *sample,
1671                                     struct callchain_cursor *cursor)
1672 {
1673         struct addr_location al;
1674         int max_stack = evsel->attr.sample_max_stack ?
1675                         evsel->attr.sample_max_stack :
1676                         trace->max_stack;
1677
1678         if (machine__resolve(trace->host, &al, sample) < 0 ||
1679             thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack))
1680                 return -1;
1681
1682         return 0;
1683 }
1684
1685 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1686 {
1687         /* TODO: user-configurable print_opts */
1688         const unsigned int print_opts = EVSEL__PRINT_SYM |
1689                                         EVSEL__PRINT_DSO |
1690                                         EVSEL__PRINT_UNKNOWN_AS_ADDR;
1691
1692         return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1693 }
1694
/*
 * Translate an errno value into its arch-specific symbolic name, using the
 * architecture recorded in the evsel's perf_env.
 */
static const char *errno_to_name(struct perf_evsel *evsel, int err)
{
	const char *arch = perf_env__arch(perf_evsel__env(evsel));

	return arch_syscalls__strerrno(arch, err);
}
1702
/*
 * Handler for raw_syscalls:sys_exit: complete the line started by
 * trace__sys_enter() (or print a "continued" stub if the entry wasn't
 * pending), format the return value according to the syscall's fmt, and
 * update summary stats.  Returns 0 on success, -1 on failure.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/*
	 * An open-like syscall that succeeded: associate the pathname
	 * captured by trace__vfs_getname() with the returned fd, so later
	 * syscalls on that fd can show the path.
	 */
	if (sc->is_open && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Too shallow for --min-stack: suppress the line. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only || (ret >= 0 && trace->failure_only))
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* Entry was printed (interrupted) earlier; mark continuation. */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	/*
	 * Return-value formatting.  Note the labels below are jumped into
	 * from other arms of this if/else chain: syscalls without a fmt
	 * share the errno path, and unknown fmts fall back to signed_print.
	 */
	if (sc->fmt == NULL) {
		if (ret < 0)
			goto errno_print;
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0) {
errno_print: {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = errno_to_name(evsel, -ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	}
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (ttrace->ret_scnprintf) {
		/* One-shot custom formatter installed while printing the args. */
		char bf[1024];
		struct syscall_arg arg = {
			.val	= ret,
			.thread	= thread,
			.trace	= trace,
		};
		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
		ttrace->ret_scnprintf = NULL;
		fprintf(trace->output, ") = %s", bf);
	} else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		/* The return value is a pid (e.g. fork/wait-like syscalls). */
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1818
/*
 * Handler for the probe:vfs_getname tracepoint: stash the pathname the
 * kernel just resolved, so that trace__sys_exit() can associate it with
 * the fd returned by an open-like syscall, and — if the syscall argument
 * formatter left a marker (ttrace->filename.ptr/entry_str_pos, presumably
 * set while beautifying the pathname argument — see the beauty code) —
 * splice the pathname directly into the pending entry_str.
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out_put;

	filename_len = strlen(filename);
	if (filename_len == 0)
		goto out_put;

	/* Grow the per-thread copy buffer if this pathname doesn't fit. */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out_put;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

	/* No in-buffer marker: nothing to splice into entry_str. */
	if (!ttrace->filename.ptr)
		goto out_put;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out_put;

	/* Keep the tail (most significant part) of an over-long pathname. */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* Open a gap at the marker position and copy the pathname into it. */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	/* Consume the marker so it isn't reused for another sample. */
	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}
1879
1880 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1881                                      union perf_event *event __maybe_unused,
1882                                      struct perf_sample *sample)
1883 {
1884         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1885         double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1886         struct thread *thread = machine__findnew_thread(trace->host,
1887                                                         sample->pid,
1888                                                         sample->tid);
1889         struct thread_trace *ttrace = thread__trace(thread, trace->output);
1890
1891         if (ttrace == NULL)
1892                 goto out_dump;
1893
1894         ttrace->runtime_ms += runtime_ms;
1895         trace->runtime_ms += runtime_ms;
1896 out_put:
1897         thread__put(thread);
1898         return 0;
1899
1900 out_dump:
1901         fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1902                evsel->name,
1903                perf_evsel__strval(evsel, sample, "comm"),
1904                (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1905                runtime,
1906                perf_evsel__intval(evsel, sample, "vruntime"));
1907         goto out_put;
1908 }
1909
1910 static int bpf_output__printer(enum binary_printer_ops op,
1911                                unsigned int val, void *extra __maybe_unused, FILE *fp)
1912 {
1913         unsigned char ch = (unsigned char)val;
1914
1915         switch (op) {
1916         case BINARY_PRINT_CHAR_DATA:
1917                 return fprintf(fp, "%c", isprint(ch) ? ch : '.');
1918         case BINARY_PRINT_DATA_BEGIN:
1919         case BINARY_PRINT_LINE_BEGIN:
1920         case BINARY_PRINT_ADDR:
1921         case BINARY_PRINT_NUM_DATA:
1922         case BINARY_PRINT_NUM_PAD:
1923         case BINARY_PRINT_SEP:
1924         case BINARY_PRINT_CHAR_PAD:
1925         case BINARY_PRINT_LINE_END:
1926         case BINARY_PRINT_DATA_END:
1927         default:
1928                 break;
1929         }
1930
1931         return 0;
1932 }
1933
1934 static void bpf_output__fprintf(struct trace *trace,
1935                                 struct perf_sample *sample)
1936 {
1937         binary__fprintf(sample->raw_data, sample->raw_size, 8,
1938                         bpf_output__printer, NULL, trace->output);
1939 }
1940
1941 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1942                                 union perf_event *event __maybe_unused,
1943                                 struct perf_sample *sample)
1944 {
1945         int callchain_ret = 0;
1946
1947         if (sample->callchain) {
1948                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1949                 if (callchain_ret == 0) {
1950                         if (callchain_cursor.nr < trace->min_stack)
1951                                 goto out;
1952                         callchain_ret = 1;
1953                 }
1954         }
1955
1956         trace__printf_interrupted_entry(trace);
1957         trace__fprintf_tstamp(trace, sample->time, trace->output);
1958
1959         if (trace->trace_syscalls)
1960                 fprintf(trace->output, "(         ): ");
1961
1962         fprintf(trace->output, "%s:", evsel->name);
1963
1964         if (perf_evsel__is_bpf_output(evsel)) {
1965                 bpf_output__fprintf(trace, sample);
1966         } else if (evsel->tp_format) {
1967                 event_format__fprintf(evsel->tp_format, sample->cpu,
1968                                       sample->raw_data, sample->raw_size,
1969                                       trace->output);
1970         }
1971
1972         fprintf(trace->output, "\n");
1973
1974         if (callchain_ret > 0)
1975                 trace__fprintf_callchain(trace, sample);
1976         else if (callchain_ret < 0)
1977                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1978 out:
1979         return 0;
1980 }
1981
1982 static void print_location(FILE *f, struct perf_sample *sample,
1983                            struct addr_location *al,
1984                            bool print_dso, bool print_sym)
1985 {
1986
1987         if ((verbose > 0 || print_dso) && al->map)
1988                 fprintf(f, "%s@", al->map->dso->long_name);
1989
1990         if ((verbose > 0 || print_sym) && al->sym)
1991                 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1992                         al->addr - al->sym->start);
1993         else if (al->map)
1994                 fprintf(f, "0x%" PRIx64, al->addr);
1995         else
1996                 fprintf(f, "0x%" PRIx64, sample->addr);
1997 }
1998
1999 static int trace__pgfault(struct trace *trace,
2000                           struct perf_evsel *evsel,
2001                           union perf_event *event __maybe_unused,
2002                           struct perf_sample *sample)
2003 {
2004         struct thread *thread;
2005         struct addr_location al;
2006         char map_type = 'd';
2007         struct thread_trace *ttrace;
2008         int err = -1;
2009         int callchain_ret = 0;
2010
2011         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2012
2013         if (sample->callchain) {
2014                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2015                 if (callchain_ret == 0) {
2016                         if (callchain_cursor.nr < trace->min_stack)
2017                                 goto out_put;
2018                         callchain_ret = 1;
2019                 }
2020         }
2021
2022         ttrace = thread__trace(thread, trace->output);
2023         if (ttrace == NULL)
2024                 goto out_put;
2025
2026         if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2027                 ttrace->pfmaj++;
2028         else
2029                 ttrace->pfmin++;
2030
2031         if (trace->summary_only)
2032                 goto out;
2033
2034         thread__find_symbol(thread, sample->cpumode, sample->ip, &al);
2035
2036         trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
2037
2038         fprintf(trace->output, "%sfault [",
2039                 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2040                 "maj" : "min");
2041
2042         print_location(trace->output, sample, &al, false, true);
2043
2044         fprintf(trace->output, "] => ");
2045
2046         thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
2047
2048         if (!al.map) {
2049                 thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
2050
2051                 if (al.map)
2052                         map_type = 'x';
2053                 else
2054                         map_type = '?';
2055         }
2056
2057         print_location(trace->output, sample, &al, true, false);
2058
2059         fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2060
2061         if (callchain_ret > 0)
2062                 trace__fprintf_callchain(trace, sample);
2063         else if (callchain_ret < 0)
2064                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2065 out:
2066         err = 0;
2067 out_put:
2068         thread__put(thread);
2069         return err;
2070 }
2071
2072 static void trace__set_base_time(struct trace *trace,
2073                                  struct perf_evsel *evsel,
2074                                  struct perf_sample *sample)
2075 {
2076         /*
2077          * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2078          * and don't use sample->time unconditionally, we may end up having
2079          * some other event in the future without PERF_SAMPLE_TIME for good
2080          * reason, i.e. we may not be interested in its timestamps, just in
2081          * it taking place, picking some piece of information when it
2082          * appears in our event stream (vfs_getname comes to mind).
2083          */
2084         if (trace->base_time == 0 && !trace->full_time &&
2085             (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2086                 trace->base_time = sample->time;
2087 }
2088
2089 static int trace__process_sample(struct perf_tool *tool,
2090                                  union perf_event *event,
2091                                  struct perf_sample *sample,
2092                                  struct perf_evsel *evsel,
2093                                  struct machine *machine __maybe_unused)
2094 {
2095         struct trace *trace = container_of(tool, struct trace, tool);
2096         struct thread *thread;
2097         int err = 0;
2098
2099         tracepoint_handler handler = evsel->handler;
2100
2101         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2102         if (thread && thread__is_filtered(thread))
2103                 goto out;
2104
2105         trace__set_base_time(trace, evsel, sample);
2106
2107         if (handler) {
2108                 ++trace->nr_events;
2109                 handler(trace, evsel, event, sample);
2110         }
2111 out:
2112         thread__put(thread);
2113         return err;
2114 }
2115
2116 static int trace__record(struct trace *trace, int argc, const char **argv)
2117 {
2118         unsigned int rec_argc, i, j;
2119         const char **rec_argv;
2120         const char * const record_args[] = {
2121                 "record",
2122                 "-R",
2123                 "-m", "1024",
2124                 "-c", "1",
2125         };
2126
2127         const char * const sc_args[] = { "-e", };
2128         unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2129         const char * const majpf_args[] = { "-e", "major-faults" };
2130         unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2131         const char * const minpf_args[] = { "-e", "minor-faults" };
2132         unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2133
2134         /* +1 is for the event string below */
2135         rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2136                 majpf_args_nr + minpf_args_nr + argc;
2137         rec_argv = calloc(rec_argc + 1, sizeof(char *));
2138
2139         if (rec_argv == NULL)
2140                 return -ENOMEM;
2141
2142         j = 0;
2143         for (i = 0; i < ARRAY_SIZE(record_args); i++)
2144                 rec_argv[j++] = record_args[i];
2145
2146         if (trace->trace_syscalls) {
2147                 for (i = 0; i < sc_args_nr; i++)
2148                         rec_argv[j++] = sc_args[i];
2149
2150                 /* event string may be different for older kernels - e.g., RHEL6 */
2151                 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2152                         rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2153                 else if (is_valid_tracepoint("syscalls:sys_enter"))
2154                         rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2155                 else {
2156                         pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2157                         free(rec_argv);
2158                         return -1;
2159                 }
2160         }
2161
2162         if (trace->trace_pgfaults & TRACE_PFMAJ)
2163                 for (i = 0; i < majpf_args_nr; i++)
2164                         rec_argv[j++] = majpf_args[i];
2165
2166         if (trace->trace_pgfaults & TRACE_PFMIN)
2167                 for (i = 0; i < minpf_args_nr; i++)
2168                         rec_argv[j++] = minpf_args[i];
2169
2170         for (i = 0; i < (unsigned int)argc; i++)
2171                 rec_argv[j++] = argv[i];
2172
2173         return cmd_record(j, rec_argv);
2174 }
2175
2176 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2177
2178 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2179 {
2180         struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2181
2182         if (IS_ERR(evsel))
2183                 return false;
2184
2185         if (perf_evsel__field(evsel, "pathname") == NULL) {
2186                 perf_evsel__delete(evsel);
2187                 return false;
2188         }
2189
2190         evsel->handler = trace__vfs_getname;
2191         perf_evlist__add(evlist, evsel);
2192         return true;
2193 }
2194
2195 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2196 {
2197         struct perf_evsel *evsel;
2198         struct perf_event_attr attr = {
2199                 .type = PERF_TYPE_SOFTWARE,
2200                 .mmap_data = 1,
2201         };
2202
2203         attr.config = config;
2204         attr.sample_period = 1;
2205
2206         event_attr_init(&attr);
2207
2208         evsel = perf_evsel__new(&attr);
2209         if (evsel)
2210                 evsel->handler = trace__pgfault;
2211
2212         return evsel;
2213 }
2214
2215 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2216 {
2217         const u32 type = event->header.type;
2218         struct perf_evsel *evsel;
2219
2220         if (type != PERF_RECORD_SAMPLE) {
2221                 trace__process_event(trace, trace->host, event, sample);
2222                 return;
2223         }
2224
2225         evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2226         if (evsel == NULL) {
2227                 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2228                 return;
2229         }
2230
2231         trace__set_base_time(trace, evsel, sample);
2232
2233         if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2234             sample->raw_data == NULL) {
2235                 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2236                        perf_evsel__name(evsel), sample->tid,
2237                        sample->cpu, sample->raw_size);
2238         } else {
2239                 tracepoint_handler handler = evsel->handler;
2240                 handler(trace, evsel, event, sample);
2241         }
2242 }
2243
2244 static int trace__add_syscall_newtp(struct trace *trace)
2245 {
2246         int ret = -1;
2247         struct perf_evlist *evlist = trace->evlist;
2248         struct perf_evsel *sys_enter, *sys_exit;
2249
2250         sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2251         if (sys_enter == NULL)
2252                 goto out;
2253
2254         if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2255                 goto out_delete_sys_enter;
2256
2257         sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2258         if (sys_exit == NULL)
2259                 goto out_delete_sys_enter;
2260
2261         if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2262                 goto out_delete_sys_exit;
2263
2264         perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
2265         perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);
2266
2267         perf_evlist__add(evlist, sys_enter);
2268         perf_evlist__add(evlist, sys_exit);
2269
2270         if (callchain_param.enabled && !trace->kernel_syscallchains) {
2271                 /*
2272                  * We're interested only in the user space callchain
2273                  * leading to the syscall, allow overriding that for
2274                  * debugging reasons using --kernel_syscall_callchains
2275                  */
2276                 sys_exit->attr.exclude_callchain_kernel = 1;
2277         }
2278
2279         trace->syscalls.events.sys_enter = sys_enter;
2280         trace->syscalls.events.sys_exit  = sys_exit;
2281
2282         ret = 0;
2283 out:
2284         return ret;
2285
2286 out_delete_sys_exit:
2287         perf_evsel__delete_priv(sys_exit);
2288 out_delete_sys_enter:
2289         perf_evsel__delete_priv(sys_enter);
2290         goto out;
2291 }
2292
2293 static int trace__set_ev_qualifier_filter(struct trace *trace)
2294 {
2295         int err = -1;
2296         struct perf_evsel *sys_exit;
2297         char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2298                                                 trace->ev_qualifier_ids.nr,
2299                                                 trace->ev_qualifier_ids.entries);
2300
2301         if (filter == NULL)
2302                 goto out_enomem;
2303
2304         if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2305                                           filter)) {
2306                 sys_exit = trace->syscalls.events.sys_exit;
2307                 err = perf_evsel__append_tp_filter(sys_exit, filter);
2308         }
2309
2310         free(filter);
2311 out:
2312         return err;
2313 out_enomem:
2314         errno = ENOMEM;
2315         goto out;
2316 }
2317
2318 static int trace__set_filter_loop_pids(struct trace *trace)
2319 {
2320         unsigned int nr = 1;
2321         pid_t pids[32] = {
2322                 getpid(),
2323         };
2324         struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
2325
2326         while (thread && nr < ARRAY_SIZE(pids)) {
2327                 struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
2328
2329                 if (parent == NULL)
2330                         break;
2331
2332                 if (!strcmp(thread__comm_str(parent), "sshd")) {
2333                         pids[nr++] = parent->tid;
2334                         break;
2335                 }
2336                 thread = parent;
2337         }
2338
2339         return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
2340 }
2341
/*
 * Set up and run a live trace session: add the raw_syscalls tracepoints,
 * the vfs_getname probe and the optional page fault/sched_stat_runtime
 * events to the evlist, create the thread/cpu maps, open + mmap the
 * events, start (or attach to) the workload, then loop consuming the
 * ring buffers until the workload finishes or the user interrupts us.
 *
 * Returns 0 on success, negative on error; the error labels at the end
 * print a human readable diagnostic to trace->output before unwinding
 * through out_delete_evlist.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;	/* leftover args are a workload to fork */
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	/*
	 * If a global cgroup was set, apply it to all the events without an
	 * explicit cgroup. I.e.:
	 *
	 * 	trace -G A -e sched:*switch
	 *
	 * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
	 * _and_ sched:sched_switch to the 'A' cgroup, while:
	 *
	 * trace -e sched:*switch -G A
	 *
	 * will only set the sched:sched_switch event to the 'A' cgroup, all the
	 * other events (raw_syscalls:sys_{enter,exit}, etc are left "without"
	 * a cgroup (on the root cgroup, sys wide, etc).
	 *
	 * Multiple cgroups:
	 *
	 * trace -G A -e sched:*switch -G B
	 *
	 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
	 * to the 'B' cgroup.
	 *
	 * evlist__set_default_cgroup() grabs a reference of the passed cgroup
	 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
	 */
	if (trace->cgroup)
		evlist__set_default_cgroup(trace->evlist, trace->cgroup);

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, &callchain_param);

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = trace__set_filter_loop_pids(trace);

	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
	if (err < 0)
		goto out_error_mmap;

	/* With --delay the events get enabled only after the delay expires */
	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	if (trace->opts.initial_delay) {
		usleep(trace->opts.initial_delay * 1000);
		perf_evlist__enable(evlist);
	}

	/* Decides whether the tid gets printed with each event line */
	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;

	/*
	 * Now that we already used evsel->attr to ask the kernel to setup the
	 * events, lets reuse evsel->attr.sample_max_stack as the limit in
	 * trace__resolve_callchain(), allowing per-event max-stack settings
	 * to override an explicitely set --max-stack global setting.
	 */
	evlist__for_each_entry(evlist, evsel) {
		if (evsel__has_callchain(evsel) &&
		    evsel->attr.sample_max_stack == 0)
			evsel->attr.sample_max_stack = trace->max_stack;
	}
again:
	before = trace->nr_events;	/* to detect an idle pass over the mmaps */

	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;
		struct perf_mmap *md;

		md = &evlist->mmap[i];
		if (perf_mmap__read_init(md) < 0)
			continue;

		while ((event = perf_mmap__read_event(md)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_mmap__consume(md);

			if (interrupted)
				goto out_disable;

			/*
			 * Workload exited (SIGCHLD): stop producing events but
			 * keep draining what is already in the buffers.
			 */
			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
		perf_mmap__read_done(md);
	}

	if (trace->nr_events == before) {
		/* Nothing new; poll (forever, or briefly if we are winding down) */
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	trace__symbols__exit(trace);

	perf_evlist__delete(evlist);
	cgroup__put(trace->cgroup);
	trace->evlist = NULL;
	trace->live = false;
	return err;
{
	/*
	 * Nested scope exists only to declare errbuf for the strerror-style
	 * error labels below; they are reachable solely via the gotos above,
	 * never by falling through the return.
	 */
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
2618
/*
 * 'perf trace -i perf.data': replay a previously recorded session instead
 * of tracing live. Wires the same handlers used in live mode (sys_enter /
 * sys_exit, vfs_getname, page faults) into a perf_session and processes
 * the file's events in timestamp order.
 *
 * Returns 0 on success, negative on error.
 */
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname",	     trace__vfs_getname, },
	};
	struct perf_data data = {
		.file      = {
			.path = input_name,
		},
		.mode      = PERF_DATA_MODE_READ,
		.force     = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	trace->tool.sample	  = trace__process_sample;
	trace->tool.mmap	  = perf_event__process_mmap;
	trace->tool.mmap2	  = perf_event__process_mmap2;
	trace->tool.comm	  = perf_event__process_comm;
	trace->tool.exit	  = perf_event__process_exit;
	trace->tool.fork	  = perf_event__process_fork;
	trace->tool.attr	  = perf_event__process_attr;
	trace->tool.tracing_data  = perf_event__process_tracing_data;
	trace->tool.build_id	  = perf_event__process_build_id;
	trace->tool.namespaces	  = perf_event__process_namespaces;

	/* The strace-like output only makes sense in timestamp order */
	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&data, false, &trace->tool);
	if (session == NULL)
		return -1;

	if (trace->opts.target.pid)
		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);

	if (trace->opts.target.tid)
		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Route any recorded page fault software events to the pgfault printer */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}
2719
/* Print the banner that precedes the per-thread summary section. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2728
/*
 * Resorted view of a thread's syscall_stats intlist, ordered by the
 * a->msecs > b->msecs comparison (total time spent per syscall). The
 * struct members below become fields of each resorted entry; the function
 * body fills one entry from an intlist node keyed by syscall id, with the
 * accumulated stats hanging off ->priv.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* total = nr samples * average, converted from ns; 0 if never hit */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2742
/*
 * Print the per-thread syscall summary table: one row per syscall with
 * call count, total/min/avg/max times in msec and the relative stddev,
 * using the syscall_stats resorted rb-tree declared above.
 *
 * Returns the number of characters printed (0 if there are no stats).
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* stats are accumulated in nanoseconds, table is in msec */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* relative standard deviation, as a percentage of the avg */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			/* syscall ids index straight into the syscall table */
			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2785
2786 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2787 {
2788         size_t printed = 0;
2789         struct thread_trace *ttrace = thread__priv(thread);
2790         double ratio;
2791
2792         if (ttrace == NULL)
2793                 return 0;
2794
2795         ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2796
2797         printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2798         printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2799         printed += fprintf(fp, "%.1f%%", ratio);
2800         if (ttrace->pfmaj)
2801                 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2802         if (ttrace->pfmin)
2803                 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2804         if (trace->sched)
2805                 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2806         else if (fputc('\n', fp) != EOF)
2807                 ++printed;
2808
2809         printed += thread__dump_stats(ttrace, trace, fp);
2810
2811         return printed;
2812 }
2813
2814 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2815 {
2816         return ttrace ? ttrace->nr_events : 0;
2817 }
2818
/*
 * Resorted view of a machine's threads rb-tree, ordered by per-thread
 * event count (comparison on thread__nr_events() of the thread_trace
 * hung off thread->priv); used below to emit the summary in a stable,
 * activity-based order.
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2825
/*
 * Print the "Summary of events" section: walk every bucket of the host
 * machine's threads hash table, resort the threads in each bucket by
 * event count and print one per-thread summary (including its syscall
 * table). Returns the number of characters printed, or 0 when the
 * resort tree could not be allocated.
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;
	int i;

	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
		/* declares and builds the 'threads' resorted rb-tree */
		DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);

		if (threads == NULL) {
			fprintf(fp, "%s", "Error sorting output by nr_events!\n");
			return 0;
		}

		resort_rb__for_each_entry(nd, threads)
			printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

		resort_rb__delete(threads);
	}
	return printed;
}
2847
2848 static int trace__set_duration(const struct option *opt, const char *str,
2849                                int unset __maybe_unused)
2850 {
2851         struct trace *trace = opt->value;
2852
2853         trace->duration_filter = atof(str);
2854         return 0;
2855 }
2856
2857 static int trace__set_filter_pids(const struct option *opt, const char *str,
2858                                   int unset __maybe_unused)
2859 {
2860         int ret = -1;
2861         size_t i;
2862         struct trace *trace = opt->value;
2863         /*
2864          * FIXME: introduce a intarray class, plain parse csv and create a
2865          * { int nr, int entries[] } struct...
2866          */
2867         struct intlist *list = intlist__new(str);
2868
2869         if (list == NULL)
2870                 return -1;
2871
2872         i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2873         trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2874
2875         if (trace->filter_pids.entries == NULL)
2876                 goto out;
2877
2878         trace->filter_pids.entries[0] = getpid();
2879
2880         for (i = 1; i < trace->filter_pids.nr; ++i)
2881                 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2882
2883         intlist__delete(list);
2884         ret = 0;
2885 out:
2886         return ret;
2887 }
2888
2889 static int trace__open_output(struct trace *trace, const char *filename)
2890 {
2891         struct stat st;
2892
2893         if (!stat(filename, &st) && st.st_size) {
2894                 char oldname[PATH_MAX];
2895
2896                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2897                 unlink(oldname);
2898                 rename(filename, oldname);
2899         }
2900
2901         trace->output = fopen(filename, "w");
2902
2903         return trace->output == NULL ? -errno : 0;
2904 }
2905
2906 static int parse_pagefaults(const struct option *opt, const char *str,
2907                             int unset __maybe_unused)
2908 {
2909         int *trace_pgfaults = opt->value;
2910
2911         if (strcmp(str, "all") == 0)
2912                 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2913         else if (strcmp(str, "maj") == 0)
2914                 *trace_pgfaults |= TRACE_PFMAJ;
2915         else if (strcmp(str, "min") == 0)
2916                 *trace_pgfaults |= TRACE_PFMIN;
2917         else
2918                 return -1;
2919
2920         return 0;
2921 }
2922
/* Set the same sample handler on every event in the evlist. */
static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel)
		evsel->handler = handler;
}
2930
2931 /*
2932  * XXX: Hackish, just splitting the combined -e+--event (syscalls
2933  * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
2934  * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2935  *
2936  * It'd be better to introduce a parse_options() variant that would return a
2937  * list with the terms it didn't match to an event...
2938  */
2939 static int trace__parse_events_option(const struct option *opt, const char *str,
2940                                       int unset __maybe_unused)
2941 {
2942         struct trace *trace = (struct trace *)opt->value;
2943         const char *s = str;
2944         char *sep = NULL, *lists[2] = { NULL, NULL, };
2945         int len = strlen(str) + 1, err = -1, list, idx;
2946         char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2947         char group_name[PATH_MAX];
2948
2949         if (strace_groups_dir == NULL)
2950                 return -1;
2951
2952         if (*s == '!') {
2953                 ++s;
2954                 trace->not_ev_qualifier = true;
2955         }
2956
2957         while (1) {
2958                 if ((sep = strchr(s, ',')) != NULL)
2959                         *sep = '\0';
2960
2961                 list = 0;
2962                 if (syscalltbl__id(trace->sctbl, s) >= 0 ||
2963                     syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
2964                         list = 1;
2965                 } else {
2966                         path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2967                         if (access(group_name, R_OK) == 0)
2968                                 list = 1;
2969                 }
2970
2971                 if (lists[list]) {
2972                         sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2973                 } else {
2974                         lists[list] = malloc(len);
2975                         if (lists[list] == NULL)
2976                                 goto out;
2977                         strcpy(lists[list], s);
2978                 }
2979
2980                 if (!sep)
2981                         break;
2982
2983                 *sep = ',';
2984                 s = sep + 1;
2985         }
2986
2987         if (lists[1] != NULL) {
2988                 struct strlist_config slist_config = {
2989                         .dirname = strace_groups_dir,
2990                 };
2991
2992                 trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2993                 if (trace->ev_qualifier == NULL) {
2994                         fputs("Not enough memory to parse event qualifier", trace->output);
2995                         goto out;
2996                 }
2997
2998                 if (trace__validate_ev_qualifier(trace))
2999                         goto out;
3000                 trace->trace_syscalls = true;
3001         }
3002
3003         err = 0;
3004
3005         if (lists[0]) {
3006                 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
3007                                                "event selector. use 'perf list' to list available events",
3008                                                parse_events_option);
3009                 err = parse_events_option(&o, lists[0], 0);
3010         }
3011 out:
3012         if (sep)
3013                 *sep = ',';
3014
3015         return err;
3016 }
3017
3018 static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
3019 {
3020         struct trace *trace = opt->value;
3021
3022         if (!list_empty(&trace->evlist->entries))
3023                 return parse_cgroups(opt, str, unset);
3024
3025         trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
3026
3027         return 0;
3028 }
3029
3030 int cmd_trace(int argc, const char **argv)
3031 {
3032         const char *trace_usage[] = {
3033                 "perf trace [<options>] [<command>]",
3034                 "perf trace [<options>] -- <command> [<options>]",
3035                 "perf trace record [<options>] [<command>]",
3036                 "perf trace record [<options>] -- <command> [<options>]",
3037                 NULL
3038         };
3039         struct trace trace = {
3040                 .syscalls = {
3041                         . max = -1,
3042                 },
3043                 .opts = {
3044                         .target = {
3045                                 .uid       = UINT_MAX,
3046                                 .uses_mmap = true,
3047                         },
3048                         .user_freq     = UINT_MAX,
3049                         .user_interval = ULLONG_MAX,
3050                         .no_buffering  = true,
3051                         .mmap_pages    = UINT_MAX,
3052                         .proc_map_timeout  = 500,
3053                 },
3054                 .output = stderr,
3055                 .show_comm = true,
3056                 .trace_syscalls = false,
3057                 .kernel_syscallchains = false,
3058                 .max_stack = UINT_MAX,
3059         };
3060         const char *output_name = NULL;
3061         const struct option trace_options[] = {
3062         OPT_CALLBACK('e', "event", &trace, "event",
3063                      "event/syscall selector. use 'perf list' to list available events",
3064                      trace__parse_events_option),
3065         OPT_BOOLEAN(0, "comm", &trace.show_comm,
3066                     "show the thread COMM next to its id"),
3067         OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3068         OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
3069                      trace__parse_events_option),
3070         OPT_STRING('o', "output", &output_name, "file", "output file name"),
3071         OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3072         OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3073                     "trace events on existing process id"),
3074         OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3075                     "trace events on existing thread id"),
3076         OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3077                      "pids to filter (by the kernel)", trace__set_filter_pids),
3078         OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3079                     "system-wide collection from all CPUs"),
3080         OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3081                     "list of cpus to monitor"),
3082         OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3083                     "child tasks do not inherit counters"),
3084         OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3085                      "number of mmap data pages",
3086                      perf_evlist__parse_mmap_pages),
3087         OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3088                    "user to profile"),
3089         OPT_CALLBACK(0, "duration", &trace, "float",
3090                      "show only events with duration > N.M ms",
3091                      trace__set_duration),
3092         OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3093         OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3094         OPT_BOOLEAN('T', "time", &trace.full_time,
3095                     "Show full timestamp, not time relative to first start"),
3096         OPT_BOOLEAN(0, "failure", &trace.failure_only,
3097                     "Show only syscalls that failed"),
3098         OPT_BOOLEAN('s', "summary", &trace.summary_only,
3099                     "Show only syscall summary with statistics"),
3100         OPT_BOOLEAN('S', "with-summary", &trace.summary,
3101                     "Show all syscalls and summary with statistics"),
3102         OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
		     /*
		      * NOTE(review): this chunk starts mid-way through
		      * cmd_trace()'s trace_options[] table; the function
		      * header and the earlier OPT_* entries are above it.
		      */
		     "Trace pagefaults", parse_pagefaults, "maj"),
	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
	OPT_CALLBACK(0, "call-graph", &trace.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
		    "Show the kernel callchains on the syscall exit path"),
	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
		     "Set the minimum stack depth when parsing the callchain, "
		     "anything below the specified depth will be ignored."),
	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
		     "Set the maximum stack depth when parsing the callchain, "
		     "anything beyond the specified depth will be ignored. "
		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
	OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
			"print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
		     trace__parse_cgroups),
	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
		     "ms to wait before starting measurement after program "
		     "start"),
	OPT_END()
	};
	/*
	 * Flipped to false below when the corresponding option keeps its
	 * UINT_MAX "unset" sentinel, i.e. the user did not pass it.
	 */
	bool __maybe_unused max_stack_user_set = true;
	bool mmap_pages_user_set = true;
	const char * const trace_subcommands[] = { "record", NULL };
	int err;
	char bf[BUFSIZ];	/* scratch buffer for strerror-style messages */

	/* Dump a stack trace instead of dying silently on fatal signals. */
	signal(SIGSEGV, sighandler_dump_stack);
	signal(SIGFPE, sighandler_dump_stack);

	trace.evlist = perf_evlist__new();
	trace.sctbl = syscalltbl__new();

	if (trace.evlist == NULL || trace.sctbl == NULL) {
		pr_err("Not enough memory to run!\n");
		err = -ENOMEM;
		goto out;
	}

	/*
	 * PARSE_OPT_STOP_AT_NON_OPTION: parsing stops at the first non-option
	 * token (the "record" subcommand or the workload command line), which
	 * is left in argv for the checks below.
	 */
	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

	/* Cgroup filtering is only meaningful when monitoring system-wide. */
	if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
		usage_with_options_msg(trace_usage, trace_options,
				       "cgroup monitoring only available in system-wide mode");
	}

	err = bpf__setup_stdout(trace.evlist);
	if (err) {
		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
		goto out;
	}

	err = -1;	/* default error code for the failure paths below */

	if (trace.trace_pgfaults) {
		/* Page fault samples need the faulting address and a timestamp. */
		trace.opts.sample_address = true;
		trace.opts.sample_time = true;
	}

	/* UINT_MAX means --mmap-pages was not given on the command line. */
	if (trace.opts.mmap_pages == UINT_MAX)
		mmap_pages_user_set = false;

	if (trace.max_stack == UINT_MAX) {
		/*
		 * No --max-stack given: when replaying a perf.data file use the
		 * compiled-in maximum, otherwise honour the current
		 * kernel.perf_event_max_stack sysctl value.
		 */
		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack();
		max_stack_user_set = false;
	}

#ifdef HAVE_DWARF_UNWIND_SUPPORT
	/*
	 * Stack-depth options imply callchain collection; default to DWARF
	 * unwinding when the user asked for depths but not for a record mode.
	 */
	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
	}
#endif

	if (callchain_param.enabled) {
		/*
		 * Callchains need bigger mmap buffers to avoid losing samples;
		 * only bump the default for root (geteuid() == 0), presumably
		 * because of the unprivileged mlock limit — NOTE(review):
		 * confirm against perf_event_mlock_kb semantics.
		 */
		if (!mmap_pages_user_set && geteuid() == 0)
			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

		symbol_conf.use_callchain = true;
	}

	/* Route any --event side-band events through the trace event handler. */
	if (trace.evlist->nr_entries > 0)
		evlist__set_evsel_handler(trace.evlist, trace__event_handler);

	/* "perf trace record ..." is delegated; note the early return. */
	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	/* Nothing selected at all: fall back to tracing syscalls. */
	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
		trace.trace_syscalls = true;
	}

	/* -o/--output: redirect the trace to a file instead of stderr. */
	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	/* No workload and no pid/tid/cpu/uid target: trace the whole system. */
	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	if (input_name)
		err = trace__replay(&trace);	/* -i: process existing perf.data */
	else
		err = trace__run(&trace, argc, argv);	/* live tracing */

out_close:
	/* trace.output was only fopen()'ed when -o was given. */
	if (output_name != NULL)
		fclose(trace.output);
out:
	return err;
}