perf bpf: Make bpf__setup_output_event() return the bpf-output event
[linux-2.6-microblaze.git] / tools / perf / builtin-trace.c
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/cgroup.h"
23 #include "util/color.h"
24 #include "util/debug.h"
25 #include "util/env.h"
26 #include "util/event.h"
27 #include "util/evlist.h"
28 #include <subcmd/exec-cmd.h>
29 #include "util/machine.h"
30 #include "util/path.h"
31 #include "util/session.h"
32 #include "util/thread.h"
33 #include <subcmd/parse-options.h>
34 #include "util/strlist.h"
35 #include "util/intlist.h"
36 #include "util/thread_map.h"
37 #include "util/stat.h"
38 #include "trace/beauty/beauty.h"
39 #include "trace-event.h"
40 #include "util/parse-events.h"
41 #include "util/bpf-loader.h"
42 #include "callchain.h"
43 #include "print_binary.h"
44 #include "string2.h"
45 #include "syscalltbl.h"
46 #include "rb_resort.h"
47
48 #include <errno.h>
49 #include <inttypes.h>
50 #include <poll.h>
51 #include <signal.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #include <linux/err.h>
55 #include <linux/filter.h>
56 #include <linux/kernel.h>
57 #include <linux/random.h>
58 #include <linux/stringify.h>
59 #include <linux/time64.h>
60 #include <fcntl.h>
61
62 #include "sane_ctype.h"
63
/* Fallbacks for toolchains whose headers predate these definitions. */
#ifndef O_CLOEXEC
# define O_CLOEXEC		02000000
#endif

#ifndef F_LINUX_SPECIFIC_BASE
# define F_LINUX_SPECIFIC_BASE	1024
#endif
71
/*
 * Global state for one 'perf trace' session: tool callbacks, record
 * options, the event list, syscall tables and the command line knobs.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;		/* syscall number <-> name table */
	struct {
		int		max;		/* highest syscall number in ->table */
		struct syscall	*table;		/* indexed by syscall number */
		struct {
			/* raw_syscalls:sys_{enter,exit} tracepoint evsels */
			struct perf_evsel *sys_enter,
					  *sys_exit;
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;	/* NOTE(review): looks like the last thread seen — confirm */
	struct cgroup		*cgroup;
	u64			base_time;	/* presumably first sample time, for relative timestamps */
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* syscall names from the event qualifier */
	struct {
		size_t		nr;
		int		*entries;	/* ev_qualifier resolved to syscall ids */
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;
	}			filter_pids;
	double			duration_filter;
	double			runtime_ms;
	struct {
		u64		vfs_getname,
				proc_getname;
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;	/* ev_qualifier acts as a deny list */
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			failure_only;
	bool			show_comm;
	bool			print_sample;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;
	int			trace_pgfaults;
};
125
/*
 * Accessor for one tracepoint field: its byte offset inside the
 * sample's raw data plus a reader returning it either as an integer
 * or as a pointer into the raw data.
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
133
/*
 * Generate tp_field__u{8,16,32,64}(): read a host-endian unsigned
 * integer of the given width from the sample raw data at the field's
 * offset.  memcpy() avoids unaligned access issues.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);
146
/*
 * Same as TP_UINT_FIELD(), but byte-swapping the value, for samples
 * recorded on a machine with the opposite endianness.
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
158
159 static int __tp_field__init_uint(struct tp_field *field, int size, int offset, bool needs_swap)
160 {
161         field->offset = offset;
162
163         switch (size) {
164         case 1:
165                 field->integer = tp_field__u8;
166                 break;
167         case 2:
168                 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
169                 break;
170         case 4:
171                 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
172                 break;
173         case 8:
174                 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
175                 break;
176         default:
177                 return -1;
178         }
179
180         return 0;
181 }
182
183 static int tp_field__init_uint(struct tp_field *field, struct format_field *format_field, bool needs_swap)
184 {
185         return __tp_field__init_uint(field, format_field->size, format_field->offset, needs_swap);
186 }
187
188 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
189 {
190         return sample->raw_data + field->offset;
191 }
192
193 static int __tp_field__init_ptr(struct tp_field *field, int offset)
194 {
195         field->offset = offset;
196         field->pointer = tp_field__ptr;
197         return 0;
198 }
199
200 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
201 {
202         return __tp_field__init_ptr(field, format_field->offset);
203 }
204
/*
 * Per-evsel accessors for the raw_syscalls tracepoints: the syscall
 * id plus either the entry arguments or the exit return value (the
 * two never coexist on one tracepoint, hence the union).
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
211
212 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
213                                           struct tp_field *field,
214                                           const char *name)
215 {
216         struct format_field *format_field = perf_evsel__field(evsel, name);
217
218         if (format_field == NULL)
219                 return -1;
220
221         return tp_field__init_uint(field, format_field, evsel->needs_swap);
222 }
223
/*
 * Initialize the syscall_tp member called 'name' in evsel->priv as an
 * integer accessor for the tracepoint field of the same name.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
227
/*
 * Look up tracepoint field @name on @evsel and initialize @field as a
 * raw-pointer accessor for it.  Returns -1 if the field is missing.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *fmt = perf_evsel__field(evsel, name);

	if (fmt == NULL)
		return -1;

	return tp_field__init_ptr(field, fmt);
}
239
/*
 * Initialize the syscall_tp member called 'name' in evsel->priv as a
 * pointer accessor for the tracepoint field of the same name.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
243
/* Free the evsel's private syscall_tp state, then the evsel itself. */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
249
250 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel)
251 {
252         struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));
253
254         if (evsel->priv != NULL) {
255                 if (perf_evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr"))
256                         goto out_delete;
257                 return 0;
258         }
259
260         return -ENOMEM;
261 out_delete:
262         zfree(&evsel->priv);
263         return -ENOENT;
264 }
265
266 static int perf_evsel__init_raw_syscall_tp(struct perf_evsel *evsel, void *handler)
267 {
268         evsel->priv = malloc(sizeof(struct syscall_tp));
269         if (evsel->priv != NULL) {
270                 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
271                         goto out_delete;
272
273                 evsel->handler = handler;
274                 return 0;
275         }
276
277         return -ENOMEM;
278
279 out_delete:
280         zfree(&evsel->priv);
281         return -ENOENT;
282 }
283
/*
 * Create the raw_syscalls:sys_{enter,exit} tracepoint evsel for
 * @direction, falling back to the syscalls: prefix used by older
 * kernels (e.g. RHEL6), and hook up @handler.  NULL on failure.
 */
static struct perf_evsel *perf_evsel__raw_syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
	if (IS_ERR(evsel))
		evsel = perf_evsel__newtp("syscalls", direction);

	if (IS_ERR(evsel))
		return NULL;

	if (perf_evsel__init_raw_syscall_tp(evsel, handler)) {
		perf_evsel__delete_priv(evsel);
		return NULL;
	}

	return evsel;
}
304
/* Read the syscall_tp member 'name' from a sample, as an integer... */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/* ...or as a pointer into the sample's raw data. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
312
313 size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
314 {
315         int idx = val - sa->offset;
316
317         if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL)
318                 return scnprintf(bf, size, intfmt, val);
319
320         return scnprintf(bf, size, "%s", sa->entries[idx]);
321 }
322
323 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
324                                                 const char *intfmt,
325                                                 struct syscall_arg *arg)
326 {
327         return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
328 }
329
/* Strarray formatter with a plain decimal fallback. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	const char *fallback_fmt = "%d";

	return __syscall_arg__scnprintf_strarray(bf, size, fallback_fmt, arg);
}
335
336 #define SCA_STRARRAY syscall_arg__scnprintf_strarray
337
/* A set of strarrays to be searched in order, e.g. fcntl's two cmd ranges. */
struct strarrays {
	int		nr_entries;
	struct strarray **entries;
};

/* Define a strarrays instance wrapping an existing strarray pointer array. */
#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
347
348 size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
349                                         struct syscall_arg *arg)
350 {
351         struct strarrays *sas = arg->parm;
352         int i;
353
354         for (i = 0; i < sas->nr_entries; ++i) {
355                 struct strarray *sa = sas->entries[i];
356                 int idx = arg->val - sa->offset;
357
358                 if (idx >= 0 && idx < sa->nr_entries) {
359                         if (sa->entries[idx] == NULL)
360                                 break;
361                         return scnprintf(bf, size, "%s", sa->entries[idx]);
362                 }
363         }
364
365         return scnprintf(bf, size, "%d", arg->val);
366 }
367
368 #ifndef AT_FDCWD
369 #define AT_FDCWD        -100
370 #endif
371
372 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
373                                            struct syscall_arg *arg)
374 {
375         int fd = arg->val;
376
377         if (fd == AT_FDCWD)
378                 return scnprintf(bf, size, "CWD");
379
380         return syscall_arg__scnprintf_fd(bf, size, arg);
381 }
382
383 #define SCA_FDAT syscall_arg__scnprintf_fd_at
384
385 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
386                                               struct syscall_arg *arg);
387
388 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
389
/* Format the argument in hexadecimal, e.g. for addresses. */
size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

/* Format the argument as a signed decimal int. */
size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}

/* Format the argument as a signed decimal long. */
size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%ld", arg->val);
}
404
/* Names for the bpf(2) cmd argument, indexed by command number. */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* EPOLL_CTL_* values start at 1, hence the offset. */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

/* SEEK_DATA/SEEK_HOLE may be absent from older libc headers. */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
	"GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/* Linux-specific fcntl commands, numbered from F_LINUX_SPECIFIC_BASE. */
static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", [5] = "CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);

/* Both fcntl cmd ranges, searched in order by the strarrays formatter. */
static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);

static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
485
486 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
487                                                  struct syscall_arg *arg)
488 {
489         size_t printed = 0;
490         int mode = arg->val;
491
492         if (mode == F_OK) /* 0 */
493                 return scnprintf(bf, size, "F");
494 #define P_MODE(n) \
495         if (mode & n##_OK) { \
496                 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
497                 mode &= ~n##_OK; \
498         }
499
500         P_MODE(R);
501         P_MODE(W);
502         P_MODE(X);
503 #undef P_MODE
504
505         if (mode)
506                 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
507
508         return printed;
509 }
510
511 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
512
513 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
514                                               struct syscall_arg *arg);
515
516 #define SCA_FILENAME syscall_arg__scnprintf_filename
517
518 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
519                                                 struct syscall_arg *arg)
520 {
521         int printed = 0, flags = arg->val;
522
523 #define P_FLAG(n) \
524         if (flags & O_##n) { \
525                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
526                 flags &= ~O_##n; \
527         }
528
529         P_FLAG(CLOEXEC);
530         P_FLAG(NONBLOCK);
531 #undef P_FLAG
532
533         if (flags)
534                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
535
536         return printed;
537 }
538
539 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
540
541 #ifndef GRND_NONBLOCK
542 #define GRND_NONBLOCK   0x0001
543 #endif
544 #ifndef GRND_RANDOM
545 #define GRND_RANDOM     0x0002
546 #endif
547
548 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
549                                                    struct syscall_arg *arg)
550 {
551         int printed = 0, flags = arg->val;
552
553 #define P_FLAG(n) \
554         if (flags & GRND_##n) { \
555                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
556                 flags &= ~GRND_##n; \
557         }
558
559         P_FLAG(RANDOM);
560         P_FLAG(NONBLOCK);
561 #undef P_FLAG
562
563         if (flags)
564                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
565
566         return printed;
567 }
568
569 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
570
571 #define STRARRAY(name, array) \
572           { .scnprintf  = SCA_STRARRAY, \
573             .parm       = &strarray__##array, }
574
575 #include "trace/beauty/arch_errno_names.c"
576 #include "trace/beauty/eventfd.c"
577 #include "trace/beauty/futex_op.c"
578 #include "trace/beauty/futex_val3.c"
579 #include "trace/beauty/mmap.c"
580 #include "trace/beauty/mode_t.c"
581 #include "trace/beauty/msg_flags.c"
582 #include "trace/beauty/open_flags.c"
583 #include "trace/beauty/perf_event_open.c"
584 #include "trace/beauty/pid.c"
585 #include "trace/beauty/sched_policy.c"
586 #include "trace/beauty/seccomp.c"
587 #include "trace/beauty/signum.c"
588 #include "trace/beauty/socket_type.c"
589 #include "trace/beauty/waitid_options.c"
590
/*
 * How to pretty-print one syscall argument: the formatter callback,
 * an opaque parameter for it (e.g. a strarray), the argument name,
 * and whether a zero value should still be printed.
 */
struct syscall_arg_fmt {
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void	   *parm;
	const char *name;
	bool	   show_zero;
};
597
598 static struct syscall_fmt {
599         const char *name;
600         const char *alias;
601         struct syscall_arg_fmt arg[6];
602         u8         nr_args;
603         bool       errpid;
604         bool       timeout;
605         bool       hexret;
606 } syscall_fmts[] = {
607         { .name     = "access",
608           .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
609         { .name     = "bpf",
610           .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
611         { .name     = "brk",        .hexret = true,
612           .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
613         { .name     = "clock_gettime",
614           .arg = { [0] = STRARRAY(clk_id, clockid), }, },
615         { .name     = "clone",      .errpid = true, .nr_args = 5,
616           .arg = { [0] = { .name = "flags",         .scnprintf = SCA_CLONE_FLAGS, },
617                    [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
618                    [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
619                    [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
620                    [4] = { .name = "tls",           .scnprintf = SCA_HEX, }, }, },
621         { .name     = "close",
622           .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
623         { .name     = "epoll_ctl",
624           .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
625         { .name     = "eventfd2",
626           .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
627         { .name     = "fchmodat",
628           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
629         { .name     = "fchownat",
630           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
631         { .name     = "fcntl",
632           .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
633                            .parm      = &strarrays__fcntl_cmds_arrays,
634                            .show_zero = true, },
635                    [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
636         { .name     = "flock",
637           .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
638         { .name     = "fstat", .alias = "newfstat", },
639         { .name     = "fstatat", .alias = "newfstatat", },
640         { .name     = "futex",
641           .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
642                    [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
643         { .name     = "futimesat",
644           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
645         { .name     = "getitimer",
646           .arg = { [0] = STRARRAY(which, itimers), }, },
647         { .name     = "getpid",     .errpid = true, },
648         { .name     = "getpgid",    .errpid = true, },
649         { .name     = "getppid",    .errpid = true, },
650         { .name     = "getrandom",
651           .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
652         { .name     = "getrlimit",
653           .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
654         { .name     = "gettid",     .errpid = true, },
655         { .name     = "ioctl",
656           .arg = {
657 #if defined(__i386__) || defined(__x86_64__)
658 /*
659  * FIXME: Make this available to all arches.
660  */
661                    [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
662                    [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
663 #else
664                    [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
665 #endif
666         { .name     = "kcmp",       .nr_args = 5,
667           .arg = { [0] = { .name = "pid1",      .scnprintf = SCA_PID, },
668                    [1] = { .name = "pid2",      .scnprintf = SCA_PID, },
669                    [2] = { .name = "type",      .scnprintf = SCA_KCMP_TYPE, },
670                    [3] = { .name = "idx1",      .scnprintf = SCA_KCMP_IDX, },
671                    [4] = { .name = "idx2",      .scnprintf = SCA_KCMP_IDX, }, }, },
672         { .name     = "keyctl",
673           .arg = { [0] = STRARRAY(option, keyctl_options), }, },
674         { .name     = "kill",
675           .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
676         { .name     = "linkat",
677           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
678         { .name     = "lseek",
679           .arg = { [2] = STRARRAY(whence, whences), }, },
680         { .name     = "lstat", .alias = "newlstat", },
681         { .name     = "madvise",
682           .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
683                    [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
684         { .name     = "mkdirat",
685           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
686         { .name     = "mknodat",
687           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
688         { .name     = "mlock",
689           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
690         { .name     = "mlockall",
691           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
692         { .name     = "mmap",       .hexret = true,
693 /* The standard mmap maps to old_mmap on s390x */
694 #if defined(__s390x__)
695         .alias = "old_mmap",
696 #endif
697           .arg = { [0] = { .scnprintf = SCA_HEX,        /* addr */ },
698                    [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ },
699                    [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ }, }, },
700         { .name     = "mprotect",
701           .arg = { [0] = { .scnprintf = SCA_HEX,        /* start */ },
702                    [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ }, }, },
703         { .name     = "mq_unlink",
704           .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
705         { .name     = "mremap",     .hexret = true,
706           .arg = { [0] = { .scnprintf = SCA_HEX,          /* addr */ },
707                    [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
708                    [4] = { .scnprintf = SCA_HEX,          /* new_addr */ }, }, },
709         { .name     = "munlock",
710           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
711         { .name     = "munmap",
712           .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
713         { .name     = "name_to_handle_at",
714           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
715         { .name     = "newfstatat",
716           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
717         { .name     = "open",
718           .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
719         { .name     = "open_by_handle_at",
720           .arg = { [0] = { .scnprintf = SCA_FDAT,       /* dfd */ },
721                    [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
722         { .name     = "openat",
723           .arg = { [0] = { .scnprintf = SCA_FDAT,       /* dfd */ },
724                    [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
725         { .name     = "perf_event_open",
726           .arg = { [2] = { .scnprintf = SCA_INT,        /* cpu */ },
727                    [3] = { .scnprintf = SCA_FD,         /* group_fd */ },
728                    [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
729         { .name     = "pipe2",
730           .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
731         { .name     = "pkey_alloc",
732           .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,   /* access_rights */ }, }, },
733         { .name     = "pkey_free",
734           .arg = { [0] = { .scnprintf = SCA_INT,        /* key */ }, }, },
735         { .name     = "pkey_mprotect",
736           .arg = { [0] = { .scnprintf = SCA_HEX,        /* start */ },
737                    [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ },
738                    [3] = { .scnprintf = SCA_INT,        /* pkey */ }, }, },
739         { .name     = "poll", .timeout = true, },
740         { .name     = "ppoll", .timeout = true, },
741         { .name     = "prctl", .alias = "arch_prctl",
742           .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
743                    [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
744                    [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
745         { .name     = "pread", .alias = "pread64", },
746         { .name     = "preadv", .alias = "pread", },
747         { .name     = "prlimit64",
748           .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
749         { .name     = "pwrite", .alias = "pwrite64", },
750         { .name     = "readlinkat",
751           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
752         { .name     = "recvfrom",
753           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
754         { .name     = "recvmmsg",
755           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
756         { .name     = "recvmsg",
757           .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
758         { .name     = "renameat",
759           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
760         { .name     = "rt_sigaction",
761           .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
762         { .name     = "rt_sigprocmask",
763           .arg = { [0] = STRARRAY(how, sighow), }, },
764         { .name     = "rt_sigqueueinfo",
765           .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
766         { .name     = "rt_tgsigqueueinfo",
767           .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
768         { .name     = "sched_setscheduler",
769           .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
770         { .name     = "seccomp",
771           .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,    /* op */ },
772                    [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
773         { .name     = "select", .timeout = true, },
774         { .name     = "sendmmsg",
775           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
776         { .name     = "sendmsg",
777           .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
778         { .name     = "sendto",
779           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
780         { .name     = "set_tid_address", .errpid = true, },
781         { .name     = "setitimer",
782           .arg = { [0] = STRARRAY(which, itimers), }, },
783         { .name     = "setrlimit",
784           .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
785         { .name     = "socket",
786           .arg = { [0] = STRARRAY(family, socket_families),
787                    [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
788                    [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
789         { .name     = "socketpair",
790           .arg = { [0] = STRARRAY(family, socket_families),
791                    [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
792                    [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
793         { .name     = "stat", .alias = "newstat", },
794         { .name     = "statx",
795           .arg = { [0] = { .scnprintf = SCA_FDAT,        /* fdat */ },
796                    [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
797                    [3] = { .scnprintf = SCA_STATX_MASK,  /* mask */ }, }, },
798         { .name     = "swapoff",
799           .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
800         { .name     = "swapon",
801           .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
802         { .name     = "symlinkat",
803           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
804         { .name     = "tgkill",
805           .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
806         { .name     = "tkill",
807           .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
808         { .name     = "uname", .alias = "newuname", },
809         { .name     = "unlinkat",
810           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
811         { .name     = "utimensat",
812           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
813         { .name     = "wait4",      .errpid = true,
814           .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
815         { .name     = "waitid",     .errpid = true,
816           .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
817 };
818
819 static int syscall_fmt__cmp(const void *name, const void *fmtp)
820 {
821         const struct syscall_fmt *fmt = fmtp;
822         return strcmp(name, fmt->name);
823 }
824
825 static struct syscall_fmt *syscall_fmt__find(const char *name)
826 {
827         const int nmemb = ARRAY_SIZE(syscall_fmts);
828         return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
829 }
830
831 /*
832  * is_exit: is this "exit" or "exit_group"?
833  * is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter.
834  */
835 struct syscall {
836         struct event_format *tp_format;
837         int                 nr_args;
838         bool                is_exit;
839         bool                is_open;
840         struct format_field *args;
841         const char          *name;
842         struct syscall_fmt  *fmt;
843         struct syscall_arg_fmt *arg_fmt;
844 };
845
846 /*
847  * We need to have this 'calculated' boolean because in some cases we really
848  * don't know what is the duration of a syscall, for instance, when we start
849  * a session and some threads are waiting for a syscall to finish, say 'poll',
850  * in which case all we can do is to print "( ? ) for duration and for the
851  * start timestamp.
852  */
853 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
854 {
855         double duration = (double)t / NSEC_PER_MSEC;
856         size_t printed = fprintf(fp, "(");
857
858         if (!calculated)
859                 printed += fprintf(fp, "         ");
860         else if (duration >= 1.0)
861                 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
862         else if (duration >= 0.01)
863                 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
864         else
865                 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
866         return printed + fprintf(fp, "): ");
867 }
868
869 /**
870  * filename.ptr: The filename char pointer that will be vfs_getname'd
871  * filename.entry_str_pos: Where to insert the string translated from
872  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
873  * ret_scnprintf: syscall args may set this to a different syscall return
874  *                formatter, for instance, fcntl may return fds, file flags, etc.
875  */
876 struct thread_trace {
877         u64               entry_time;
878         bool              entry_pending;
879         unsigned long     nr_events;
880         unsigned long     pfmaj, pfmin;
881         char              *entry_str;
882         double            runtime_ms;
883         size_t            (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
884         struct {
885                 unsigned long ptr;
886                 short int     entry_str_pos;
887                 bool          pending_open;
888                 unsigned int  namelen;
889                 char          *name;
890         } filename;
891         struct {
892                 int       max;
893                 char      **table;
894         } paths;
895
896         struct intlist *syscall_stats;
897 };
898
899 static struct thread_trace *thread_trace__new(void)
900 {
901         struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
902
903         if (ttrace)
904                 ttrace->paths.max = -1;
905
906         ttrace->syscall_stats = intlist__new(NULL);
907
908         return ttrace;
909 }
910
911 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
912 {
913         struct thread_trace *ttrace;
914
915         if (thread == NULL)
916                 goto fail;
917
918         if (thread__priv(thread) == NULL)
919                 thread__set_priv(thread, thread_trace__new());
920
921         if (thread__priv(thread) == NULL)
922                 goto fail;
923
924         ttrace = thread__priv(thread);
925         ++ttrace->nr_events;
926
927         return ttrace;
928 fail:
929         color_fprintf(fp, PERF_COLOR_RED,
930                       "WARNING: not enough memory, dropping samples!\n");
931         return NULL;
932 }
933
934
935 void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
936                                     size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
937 {
938         struct thread_trace *ttrace = thread__priv(arg->thread);
939
940         ttrace->ret_scnprintf = ret_scnprintf;
941 }
942
/* Page fault tracing flags: major and minor faults — see their use sites. */
#define TRACE_PFMAJ             (1 << 0)
#define TRACE_PFMIN             (1 << 1)

/* Size of the per-thread buffer that stages the formatted sys_enter line. */
static const size_t trace__entry_str_size = 2048;
947
948 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
949 {
950         struct thread_trace *ttrace = thread__priv(thread);
951
952         if (fd > ttrace->paths.max) {
953                 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
954
955                 if (npath == NULL)
956                         return -1;
957
958                 if (ttrace->paths.max != -1) {
959                         memset(npath + ttrace->paths.max + 1, 0,
960                                (fd - ttrace->paths.max) * sizeof(char *));
961                 } else {
962                         memset(npath, 0, (fd + 1) * sizeof(char *));
963                 }
964
965                 ttrace->paths.table = npath;
966                 ttrace->paths.max   = fd;
967         }
968
969         ttrace->paths.table[fd] = strdup(pathname);
970
971         return ttrace->paths.table[fd] != NULL ? 0 : -1;
972 }
973
/*
 * Resolve 'fd' to a pathname by reading the /proc/<pid>/fd/<fd> symlink
 * (or /proc/<pid>/task/<tid>/fd/<fd> for a non-leader thread), caching the
 * result via trace__set_fd_pathname().  Returns 0 on success, -1 on failure.
 */
static int thread__read_fd_path(struct thread *thread, int fd)
{
        char linkname[PATH_MAX], pathname[PATH_MAX];
        struct stat st;
        int ret;

        if (thread->pid_ == thread->tid) {
                scnprintf(linkname, sizeof(linkname),
                          "/proc/%d/fd/%d", thread->pid_, fd);
        } else {
                scnprintf(linkname, sizeof(linkname),
                          "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
        }

        /* lstat() the symlink itself: st_size is the link target's length. */
        if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
                return -1;

        ret = readlink(linkname, pathname, sizeof(pathname));

        /* Bail if the link grew between lstat() and readlink() (fd reuse race). */
        if (ret < 0 || ret > st.st_size)
                return -1;

        pathname[ret] = '\0'; /* readlink() does not NUL-terminate */
        return trace__set_fd_pathname(thread, fd, pathname);
}
999
1000 static const char *thread__fd_path(struct thread *thread, int fd,
1001                                    struct trace *trace)
1002 {
1003         struct thread_trace *ttrace = thread__priv(thread);
1004
1005         if (ttrace == NULL)
1006                 return NULL;
1007
1008         if (fd < 0)
1009                 return NULL;
1010
1011         if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
1012                 if (!trace->live)
1013                         return NULL;
1014                 ++trace->stats.proc_getname;
1015                 if (thread__read_fd_path(thread, fd))
1016                         return NULL;
1017         }
1018
1019         return ttrace->paths.table[fd];
1020 }
1021
1022 size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
1023 {
1024         int fd = arg->val;
1025         size_t printed = scnprintf(bf, size, "%d", fd);
1026         const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1027
1028         if (path)
1029                 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1030
1031         return printed;
1032 }
1033
1034 size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1035 {
1036         size_t printed = scnprintf(bf, size, "%d", fd);
1037         struct thread *thread = machine__find_thread(trace->host, pid, pid);
1038
1039         if (thread) {
1040                 const char *path = thread__fd_path(thread, fd, trace);
1041
1042                 if (path)
1043                         printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1044
1045                 thread__put(thread);
1046         }
1047
1048         return printed;
1049 }
1050
1051 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1052                                               struct syscall_arg *arg)
1053 {
1054         int fd = arg->val;
1055         size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1056         struct thread_trace *ttrace = thread__priv(arg->thread);
1057
1058         if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1059                 zfree(&ttrace->paths.table[fd]);
1060
1061         return printed;
1062 }
1063
1064 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1065                                      unsigned long ptr)
1066 {
1067         struct thread_trace *ttrace = thread__priv(thread);
1068
1069         ttrace->filename.ptr = ptr;
1070         ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1071 }
1072
1073 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1074                                               struct syscall_arg *arg)
1075 {
1076         unsigned long ptr = arg->val;
1077
1078         if (!arg->trace->vfs_getname)
1079                 return scnprintf(bf, size, "%#x", ptr);
1080
1081         thread__set_filename_pos(arg->thread, bf, ptr);
1082         return 0;
1083 }
1084
1085 static bool trace__filter_duration(struct trace *trace, double t)
1086 {
1087         return t < (trace->duration_filter * NSEC_PER_MSEC);
1088 }
1089
1090 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1091 {
1092         double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1093
1094         return fprintf(fp, "%10.3f ", ts);
1095 }
1096
1097 /*
1098  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1099  * using ttrace->entry_time for a thread that receives a sys_exit without
1100  * first having received a sys_enter ("poll" issued before tracing session
1101  * starts, lost sys_enter exit due to ring buffer overflow).
1102  */
1103 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1104 {
1105         if (tstamp > 0)
1106                 return __trace__fprintf_tstamp(trace, tstamp, fp);
1107
1108         return fprintf(fp, "         ? ");
1109 }
1110
/*
 * Set from sig_handler(), presumably polled by the main event loop to stop.
 * NOTE(review): plain bools written from a signal handler — volatile
 * sig_atomic_t would be the strictly conforming type; confirm intent.
 */
static bool done = false;
static bool interrupted = false;
1113
1114 static void sig_handler(int sig)
1115 {
1116         done = true;
1117         interrupted = sig == SIGINT;
1118 }
1119
1120 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1121                                         u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1122 {
1123         size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1124         printed += fprintf_duration(duration, duration_calculated, fp);
1125
1126         if (trace->multiple_threads) {
1127                 if (trace->show_comm)
1128                         printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1129                 printed += fprintf(fp, "%d ", thread->tid);
1130         }
1131
1132         return printed;
1133 }
1134
1135 static int trace__process_event(struct trace *trace, struct machine *machine,
1136                                 union perf_event *event, struct perf_sample *sample)
1137 {
1138         int ret = 0;
1139
1140         switch (event->header.type) {
1141         case PERF_RECORD_LOST:
1142                 color_fprintf(trace->output, PERF_COLOR_RED,
1143                               "LOST %" PRIu64 " events!\n", event->lost.lost);
1144                 ret = machine__process_lost_event(machine, event, sample);
1145                 break;
1146         default:
1147                 ret = machine__process_event(machine, event, sample);
1148                 break;
1149         }
1150
1151         return ret;
1152 }
1153
1154 static int trace__tool_process(struct perf_tool *tool,
1155                                union perf_event *event,
1156                                struct perf_sample *sample,
1157                                struct machine *machine)
1158 {
1159         struct trace *trace = container_of(tool, struct trace, tool);
1160         return trace__process_event(trace, machine, event, sample);
1161 }
1162
1163 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1164 {
1165         struct machine *machine = vmachine;
1166
1167         if (machine->kptr_restrict_warned)
1168                 return NULL;
1169
1170         if (symbol_conf.kptr_restrict) {
1171                 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1172                            "Check /proc/sys/kernel/kptr_restrict.\n\n"
1173                            "Kernel samples will not be resolved.\n");
1174                 machine->kptr_restrict_warned = true;
1175                 return NULL;
1176         }
1177
1178         return machine__resolve_kernel_addr(vmachine, addrp, modp);
1179 }
1180
/*
 * Set up symbol resolution: init the symbol subsystem, create the host
 * machine, install the kptr_restrict-aware resolver and synthesize events
 * for the preexisting threads of the target.  Returns 0 or negative errno.
 */
static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
        int err = symbol__init(NULL);

        if (err)
                return err;

        trace->host = machine__new_host();
        if (trace->host == NULL)
                return -ENOMEM;

        err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
        if (err < 0)
                goto out;

        err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
                                            evlist->threads, trace__tool_process, false,
                                            trace->opts.proc_map_timeout, 1);
out:
        /* Undo symbol__init() on any failure after it succeeded. */
        if (err)
                symbol__exit();

        return err;
}
1205
/* Tear down what trace__symbols_init() set up. */
static void trace__symbols__exit(struct trace *trace)
{
        machine__exit(trace->host);
        trace->host = NULL;

        symbol__exit();
}
1213
1214 static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1215 {
1216         int idx;
1217
1218         if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1219                 nr_args = sc->fmt->nr_args;
1220
1221         sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1222         if (sc->arg_fmt == NULL)
1223                 return -1;
1224
1225         for (idx = 0; idx < nr_args; ++idx) {
1226                 if (sc->fmt)
1227                         sc->arg_fmt[idx] = sc->fmt->arg[idx];
1228         }
1229
1230         sc->nr_args = nr_args;
1231         return 0;
1232 }
1233
/*
 * Pick a default formatter for each tracepoint arg that wasn't explicitly
 * given one in syscall_fmts[], using type/name heuristics on the fields.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
        struct format_field *field;
        int idx = 0, len;

        for (field = sc->args; field; field = field->next, ++idx) {
                /* An explicit formatter from the fmt table wins. */
                if (sc->fmt && sc->fmt->arg[idx].scnprintf)
                        continue;

                if (strcmp(field->type, "const char *") == 0 &&
                         (strcmp(field->name, "filename") == 0 ||
                          strcmp(field->name, "path") == 0 ||
                          strcmp(field->name, "pathname") == 0))
                        sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
                else if (field->flags & FIELD_IS_POINTER)
                        sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
                else if (strcmp(field->type, "pid_t") == 0)
                        sc->arg_fmt[idx].scnprintf = SCA_PID;
                else if (strcmp(field->type, "umode_t") == 0)
                        sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
                else if ((strcmp(field->type, "int") == 0 ||
                          strcmp(field->type, "unsigned int") == 0 ||
                          strcmp(field->type, "long") == 0) &&
                         (len = strlen(field->name)) >= 2 &&
                         strcmp(field->name + len - 2, "fd") == 0) {
                        /*
                         * Integer fields whose name ends in "fd" are fds.
                         * Survey of the kernel's sys_enter formats:
                         *
                         * /sys/kernel/tracing/events/syscalls/sys_enter*
                         * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
                         * 65 int
                         * 23 unsigned int
                         * 7 unsigned long
                         */
                        sc->arg_fmt[idx].scnprintf = SCA_FD;
                }
        }

        return 0;
}
1272
/*
 * Lazily fill trace->syscalls.table[id]: resolve the syscall name, look up
 * its pretty-printing overrides, read its sys_enter tracepoint format and
 * grow the table when 'id' is beyond what was allocated so far.
 * Returns 0 on success, -1 on failure.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
        char tp_name[128];
        struct syscall *sc;
        const char *name = syscalltbl__name(trace->sctbl, id);

        if (name == NULL)
                return -1;

        if (id > trace->syscalls.max) {
                struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

                if (nsyscalls == NULL)
                        return -1;

                /* Zero only the newly allocated slots. */
                if (trace->syscalls.max != -1) {
                        memset(nsyscalls + trace->syscalls.max + 1, 0,
                               (id - trace->syscalls.max) * sizeof(*sc));
                } else {
                        memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
                }

                trace->syscalls.table = nsyscalls;
                trace->syscalls.max   = id;
        }

        sc = trace->syscalls.table + id;
        sc->name = name;

        sc->fmt  = syscall_fmt__find(sc->name);

        snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
        sc->tp_format = trace_event__tp_format("syscalls", tp_name);

        /* Some syscalls are traced under an alias, e.g. "stat" -> "newstat". */
        if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
                snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
                sc->tp_format = trace_event__tp_format("syscalls", tp_name);
        }

        /* No format file: assume the generic maximum of 6 syscall args. */
        if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
                return -1;

        if (IS_ERR(sc->tp_format))
                return -1;

        sc->args = sc->tp_format->format.fields;
        /*
         * Discard the first field, '__syscall_nr' or 'nr', which carries the
         * syscall number and is not a real argument.  It does not exist on
         * older kernels, hence the check.
         */
        if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
                sc->args = sc->args->next;
                --sc->nr_args;
        }

        sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
        sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat");

        return syscall__set_arg_fmts(sc);
}
1334
/*
 * Translate the user-supplied list of syscall names in trace->ev_qualifier
 * (globs allowed) into the id array trace->ev_qualifier_ids.  All invalid
 * names are reported together before returning an error.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
        int err = 0, i;
        size_t nr_allocated;
        struct str_node *pos;

        trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
        trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
                                                 sizeof(trace->ev_qualifier_ids.entries[0]));

        if (trace->ev_qualifier_ids.entries == NULL) {
                fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
                       trace->output);
                err = -EINVAL;
                goto out;
        }

        nr_allocated = trace->ev_qualifier_ids.nr;
        i = 0;

        strlist__for_each_entry(pos, trace->ev_qualifier) {
                const char *sc = pos->s;
                int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;

                if (id < 0) {
                        /* Not an exact name: try it as a glob. */
                        id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
                        if (id >= 0)
                                goto matches;

                        /* Invalid: start (or extend) the combined error line. */
                        if (err == 0) {
                                fputs("Error:\tInvalid syscall ", trace->output);
                                err = -EINVAL;
                        } else {
                                fputs(", ", trace->output);
                        }

                        fputs(sc, trace->output);
                }
matches:
                trace->ev_qualifier_ids.entries[i++] = id;
                if (match_next == -1)
                        continue;

                /* A glob may match several syscalls: collect the rest. */
                while (1) {
                        id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
                        if (id < 0)
                                break;
                        if (nr_allocated == trace->ev_qualifier_ids.nr) {
                                void *entries;

                                nr_allocated += 8;
                                entries = realloc(trace->ev_qualifier_ids.entries,
                                                  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
                                if (entries == NULL) {
                                        err = -ENOMEM;
                                        fputs("\nError:\t Not enough memory for parsing\n", trace->output);
                                        goto out_free;
                                }
                                trace->ev_qualifier_ids.entries = entries;
                        }
                        trace->ev_qualifier_ids.nr++;
                        trace->ev_qualifier_ids.entries[i++] = id;
                }
        }

        if (err < 0) {
                fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
                      "\nHint:\tand: 'man syscalls'\n", trace->output);
out_free:
                zfree(&trace->ev_qualifier_ids.entries);
                trace->ev_qualifier_ids.nr = 0;
        }
out:
        return err;
}
1410
1411 /*
1412  * args is to be interpreted as a series of longs but we need to handle
1413  * 8-byte unaligned accesses. args points to raw_data within the event
1414  * and raw_data is guaranteed to be 8-byte unaligned because it is
1415  * preceded by raw_size which is a u32. So we need to copy args to a temp
1416  * variable to read it. Most notably this avoids extended load instructions
1417  * on unaligned addresses
1418  */
1419 unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1420 {
1421         unsigned long val;
1422         unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1423
1424         memcpy(&val, p, sizeof(val));
1425         return val;
1426 }
1427
1428 static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1429                                       struct syscall_arg *arg)
1430 {
1431         if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1432                 return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1433
1434         return scnprintf(bf, size, "arg%d: ", arg->idx);
1435 }
1436
1437 static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1438                                      struct syscall_arg *arg, unsigned long val)
1439 {
1440         if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1441                 arg->val = val;
1442                 if (sc->arg_fmt[arg->idx].parm)
1443                         arg->parm = sc->arg_fmt[arg->idx].parm;
1444                 return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1445         }
1446         return scnprintf(bf, size, "%ld", val);
1447 }
1448
/*
 * Format all of sc's arguments from the raw 'args' buffer into 'bf'.
 * Walks the tracepoint field list when available; otherwise, when the
 * /format file couldn't be read, prints sc->nr_args raw values.  Bits set
 * in arg.mask mark args already consumed by a formatter and are skipped.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
                                      unsigned char *args, struct trace *trace,
                                      struct thread *thread)
{
        size_t printed = 0;
        unsigned long val;
        u8 bit = 1;
        struct syscall_arg arg = {
                .args   = args,
                .idx    = 0,
                .mask   = 0,
                .trace  = trace,
                .thread = thread,
        };
        struct thread_trace *ttrace = thread__priv(thread);

        /*
         * Things like fcntl will set this in its 'cmd' formatter to pick the
         * right formatter for the return value (an fd? file flags?), which is
         * not needed for syscalls that always return a given type, say an fd.
         */
        ttrace->ret_scnprintf = NULL;

        if (sc->args != NULL) {
                struct format_field *field;

                for (field = sc->args; field;
                     field = field->next, ++arg.idx, bit <<= 1) {
                        if (arg.mask & bit)
                                continue;

                        val = syscall_arg__val(&arg, arg.idx);

                        /*
                         * Suppress this argument if its value is zero and
                         * we don't have a string associated in an
                         * strarray for it.
                         */
                        if (val == 0 &&
                            !(sc->arg_fmt &&
                              (sc->arg_fmt[arg.idx].show_zero ||
                               sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
                               sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
                              sc->arg_fmt[arg.idx].parm))
                                continue;

                        printed += scnprintf(bf + printed, size - printed,
                                             "%s%s: ", printed ? ", " : "", field->name);
                        printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
                }
        } else if (IS_ERR(sc->tp_format)) {
                /*
                 * If we managed to read the tracepoint /format file, then we
                 * may end up not having any args, like with gettid(), so only
                 * print the raw args when we didn't manage to read it.
                 */
                while (arg.idx < sc->nr_args) {
                        if (arg.mask & bit)
                                goto next_arg;
                        val = syscall_arg__val(&arg, arg.idx);
                        if (printed)
                                printed += scnprintf(bf + printed, size - printed, ", ");
                        printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
                        printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
next_arg:
                        ++arg.idx;
                        bit <<= 1;
                }
        }

        return printed;
}
1521
/* Signature shared by the tracepoint sample handlers (sys_enter, sys_exit, ...). */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
                                  union perf_event *event,
                                  struct perf_sample *sample);
1525
/*
 * Return the (lazily initialized) descriptor for syscall 'id', or NULL when
 * the id is invalid or its information couldn't be read.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
                                           struct perf_evsel *evsel, int id)
{

        if (id < 0) {

                /*
                 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
                 * before that, leaving at a higher verbosity level till that is
                 * explained. Reproduced with plain ftrace with:
                 *
                 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
                 * grep "NR -1 " /t/trace_pipe
                 *
                 * After generating some load on the machine.
                 */
                if (verbose > 1) {
                        static u64 n;
                        fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
                                id, perf_evsel__name(evsel), ++n);
                }
                return NULL;
        }

        /* First time we see this id: read its name/format on demand. */
        if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
            trace__read_syscall_info(trace, id))
                goto out_cant_read;

        if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
                goto out_cant_read;

        return &trace->syscalls.table[id];

out_cant_read:
        if (verbose > 0) {
                fprintf(trace->output, "Problems reading syscall %d", id);
                if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
                        fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
                fputs(" information\n", trace->output);
        }
        return NULL;
}
1568
1569 static void thread__update_stats(struct thread_trace *ttrace,
1570                                  int id, struct perf_sample *sample)
1571 {
1572         struct int_node *inode;
1573         struct stats *stats;
1574         u64 duration = 0;
1575
1576         inode = intlist__findnew(ttrace->syscall_stats, id);
1577         if (inode == NULL)
1578                 return;
1579
1580         stats = inode->priv;
1581         if (stats == NULL) {
1582                 stats = malloc(sizeof(struct stats));
1583                 if (stats == NULL)
1584                         return;
1585                 init_stats(stats);
1586                 inode->priv = stats;
1587         }
1588
1589         if (ttrace->entry_time && sample->time > ttrace->entry_time)
1590                 duration = sample->time - ttrace->entry_time;
1591
1592         update_stats(stats, duration);
1593 }
1594
1595 static int trace__printf_interrupted_entry(struct trace *trace)
1596 {
1597         struct thread_trace *ttrace;
1598         size_t printed;
1599
1600         if (trace->failure_only || trace->current == NULL)
1601                 return 0;
1602
1603         ttrace = thread__priv(trace->current);
1604
1605         if (!ttrace->entry_pending)
1606                 return 0;
1607
1608         printed  = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
1609         printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1610         ttrace->entry_pending = false;
1611
1612         return printed;
1613 }
1614
1615 static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
1616                                  struct perf_sample *sample, struct thread *thread)
1617 {
1618         int printed = 0;
1619
1620         if (trace->print_sample) {
1621                 double ts = (double)sample->time / NSEC_PER_MSEC;
1622
1623                 printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
1624                                    perf_evsel__name(evsel), ts,
1625                                    thread__comm_str(thread),
1626                                    sample->pid, sample->tid, sample->cpu);
1627         }
1628
1629         return printed;
1630 }
1631
/*
 * raw_syscalls:sys_enter handler: formats "name(args" into a per-thread
 * buffer. For most syscalls the line is held until trace__sys_exit arrives
 * with the return value; exit-only syscalls (exit, exit_group) are printed
 * immediately since no sys_exit event will follow.
 *
 * Returns 0 on success, -1 if the syscall id can't be resolved or the
 * per-thread state can't be set up.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	/* Raw pointer to the tracepoint's args payload inside the sample. */
	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Lazily allocate the per-thread entry line buffer. */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	/*
	 * Don't flush a pending entry from another thread when filtering
	 * modes may discard this line anyway.
	 */
	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		/* No sys_exit will come, so emit the full line right now. */
		if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Track the most recently seen thread; each hold takes a reference. */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1692
1693 static int trace__fprintf_sys_enter(struct trace *trace, struct perf_evsel *evsel,
1694                                     struct perf_sample *sample)
1695 {
1696         struct thread_trace *ttrace;
1697         struct thread *thread;
1698         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1699         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1700         char msg[1024];
1701         void *args;
1702
1703         if (sc == NULL)
1704                 return -1;
1705
1706         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1707         ttrace = thread__trace(thread, trace->output);
1708         /*
1709          * We need to get ttrace just to make sure it is there when syscall__scnprintf_args()
1710          * and the rest of the beautifiers accessing it via struct syscall_arg touches it.
1711          */
1712         if (ttrace == NULL)
1713                 goto out_put;
1714
1715         args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1716         syscall__scnprintf_args(sc, msg, sizeof(msg), args, trace, thread);
1717         fprintf(trace->output, "%s", msg);
1718         err = 0;
1719 out_put:
1720         thread__put(thread);
1721         return err;
1722 }
1723
1724 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1725                                     struct perf_sample *sample,
1726                                     struct callchain_cursor *cursor)
1727 {
1728         struct addr_location al;
1729         int max_stack = evsel->attr.sample_max_stack ?
1730                         evsel->attr.sample_max_stack :
1731                         trace->max_stack;
1732
1733         if (machine__resolve(trace->host, &al, sample) < 0 ||
1734             thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack))
1735                 return -1;
1736
1737         return 0;
1738 }
1739
1740 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1741 {
1742         /* TODO: user-configurable print_opts */
1743         const unsigned int print_opts = EVSEL__PRINT_SYM |
1744                                         EVSEL__PRINT_DSO |
1745                                         EVSEL__PRINT_UNKNOWN_AS_ADDR;
1746
1747         return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1748 }
1749
/*
 * Map a (positive) errno value to its symbolic name, using the architecture
 * recorded in the perf session environment, not the host's, so cross-arch
 * perf.data files decode correctly.
 */
static const char *errno_to_name(struct perf_evsel *evsel, int err)
{
	return arch_syscalls__strerrno(perf_env__arch(perf_evsel__env(evsel)), err);
}
1757
/*
 * raw_syscalls:sys_exit handler: pairs with trace__sys_enter. Prints the
 * buffered entry line (or a "continued" marker if that line was already
 * flushed), the beautified return value, and optionally the callchain;
 * also feeds the --summary statistics and fd->pathname tracking.
 *
 * Returns 0 on success, -1 if the syscall id can't be resolved or the
 * per-thread state can't be set up.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/*
	 * Successful open-like syscall with a name captured by the vfs_getname
	 * probe: remember the fd -> pathname mapping for later beautifying.
	 */
	if (sc->is_open && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out;	/* no entry seen: can't apply the duration filter */

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Shallower than --min-stack: suppress this event. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only || (ret >= 0 && trace->failure_only))
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* Entry line already flushed (interrupted); re-anchor it. */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	/*
	 * Return-value beautification. Note the errno_print label is also
	 * reached via goto from the sc->fmt == NULL branch, so that block
	 * must not (and does not) dereference sc->fmt.
	 */
	if (sc->fmt == NULL) {
		if (ret < 0)
			goto errno_print;
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0) {
errno_print: {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = errno_to_name(evsel, -ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	}
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (ttrace->ret_scnprintf) {
		char bf[1024];
		struct syscall_arg arg = {
			.val	= ret,
			.thread	= thread,
			.trace	= trace,
		};
		/* One-shot: armed by an arg beautifier at sys_enter time. */
		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
		ttrace->ret_scnprintf = NULL;
		fprintf(trace->output, ") = %s", bf);
	} else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		/* Return value is a pid (fork/wait style): show its comm. */
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1873
/*
 * probe:vfs_getname handler: caches the pathname the kernel is resolving
 * and, when the pending syscall entry line left a placeholder position for
 * this pointer argument, splices the (possibly truncated) filename into
 * that line in place. Always returns 0.
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out_put;

	filename_len = strlen(filename);
	if (filename_len == 0)
		goto out_put;

	/* Grow the cached copy only when the new name doesn't fit. */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out_put;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;	/* consumed in trace__sys_exit */

	/* No placeholder was recorded at sys_enter time: nothing to splice. */
	if (!ttrace->filename.ptr)
		goto out_put;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out_put;

	/* Keep the tail of the path, its most informative part. */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* Open a gap at the placeholder position and drop the name in. */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}
1934
/*
 * sched:sched_stat_runtime handler: accumulate on-CPU time per thread and
 * globally, for the --summary output. If per-thread storage can't be
 * obtained, dump the raw tracepoint fields instead so nothing is silently
 * lost. Always returns 0.
 */
static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
				     union perf_event *event __maybe_unused,
				     struct perf_sample *sample)
{
	u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
	struct thread *thread = machine__findnew_thread(trace->host,
							sample->pid,
							sample->tid);
	struct thread_trace *ttrace = thread__trace(thread, trace->output);

	if (ttrace == NULL)
		goto out_dump;

	ttrace->runtime_ms += runtime_ms;
	trace->runtime_ms += runtime_ms;
out_put:
	thread__put(thread);
	return 0;

out_dump:
	/* Fallback: print the raw fields, then rejoin the normal exit path. */
	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
	       evsel->name,
	       perf_evsel__strval(evsel, sample, "comm"),
	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
	       runtime,
	       perf_evsel__intval(evsel, sample, "vruntime"));
	goto out_put;
}
1964
1965 static int bpf_output__printer(enum binary_printer_ops op,
1966                                unsigned int val, void *extra __maybe_unused, FILE *fp)
1967 {
1968         unsigned char ch = (unsigned char)val;
1969
1970         switch (op) {
1971         case BINARY_PRINT_CHAR_DATA:
1972                 return fprintf(fp, "%c", isprint(ch) ? ch : '.');
1973         case BINARY_PRINT_DATA_BEGIN:
1974         case BINARY_PRINT_LINE_BEGIN:
1975         case BINARY_PRINT_ADDR:
1976         case BINARY_PRINT_NUM_DATA:
1977         case BINARY_PRINT_NUM_PAD:
1978         case BINARY_PRINT_SEP:
1979         case BINARY_PRINT_CHAR_PAD:
1980         case BINARY_PRINT_LINE_END:
1981         case BINARY_PRINT_DATA_END:
1982         default:
1983                 break;
1984         }
1985
1986         return 0;
1987 }
1988
/*
 * Dump a BPF output event's raw payload as printable characters, 8 bytes
 * per line, using bpf_output__printer above.
 */
static void bpf_output__fprintf(struct trace *trace,
				struct perf_sample *sample)
{
	binary__fprintf(sample->raw_data, sample->raw_size, 8,
			bpf_output__printer, NULL, trace->output);
}
1995
1996 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1997                                 union perf_event *event __maybe_unused,
1998                                 struct perf_sample *sample)
1999 {
2000         int callchain_ret = 0;
2001
2002         if (sample->callchain) {
2003                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2004                 if (callchain_ret == 0) {
2005                         if (callchain_cursor.nr < trace->min_stack)
2006                                 goto out;
2007                         callchain_ret = 1;
2008                 }
2009         }
2010
2011         trace__printf_interrupted_entry(trace);
2012         trace__fprintf_tstamp(trace, sample->time, trace->output);
2013
2014         if (trace->trace_syscalls)
2015                 fprintf(trace->output, "(         ): ");
2016
2017         fprintf(trace->output, "%s:", evsel->name);
2018
2019         if (perf_evsel__is_bpf_output(evsel)) {
2020                 bpf_output__fprintf(trace, sample);
2021         } else if (evsel->tp_format) {
2022                 if (strncmp(evsel->tp_format->name, "sys_enter_", 10) ||
2023                     trace__fprintf_sys_enter(trace, evsel, sample)) {
2024                         event_format__fprintf(evsel->tp_format, sample->cpu,
2025                                               sample->raw_data, sample->raw_size,
2026                                               trace->output);
2027                 }
2028         }
2029
2030         fprintf(trace->output, "\n");
2031
2032         if (callchain_ret > 0)
2033                 trace__fprintf_callchain(trace, sample);
2034         else if (callchain_ret < 0)
2035                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2036 out:
2037         return 0;
2038 }
2039
2040 static void print_location(FILE *f, struct perf_sample *sample,
2041                            struct addr_location *al,
2042                            bool print_dso, bool print_sym)
2043 {
2044
2045         if ((verbose > 0 || print_dso) && al->map)
2046                 fprintf(f, "%s@", al->map->dso->long_name);
2047
2048         if ((verbose > 0 || print_sym) && al->sym)
2049                 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
2050                         al->addr - al->sym->start);
2051         else if (al->map)
2052                 fprintf(f, "0x%" PRIx64, al->addr);
2053         else
2054                 fprintf(f, "0x%" PRIx64, sample->addr);
2055 }
2056
/*
 * Page-fault software event handler: counts major/minor faults per thread
 * and prints "maj/minfault [ip-location] => (target-location) (type)",
 * plus an optional callchain. Returns 0 on success, -1 when per-thread
 * state can't be set up.
 */
static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd';
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Shallower than --min-stack: suppress this event. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	if (trace->summary_only)
		goto out;

	/* Where the faulting instruction lives. */
	thread__find_symbol(thread, sample->cpumode, sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	/* Where the faulting data access landed. */
	thread__find_symbol(thread, sample->cpumode, sample->addr, &al);

	if (!al.map) {
		/*
		 * NOTE(review): this repeats the exact lookup above with the
		 * same arguments, so if the first call found no map this one
		 * won't either and map_type always ends up '?'. Looks like a
		 * leftover from when data and function maps were resolved
		 * separately ('x' for executable mappings) — confirm upstream.
		 */
		thread__find_symbol(thread, sample->cpumode, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
2129
2130 static void trace__set_base_time(struct trace *trace,
2131                                  struct perf_evsel *evsel,
2132                                  struct perf_sample *sample)
2133 {
2134         /*
2135          * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2136          * and don't use sample->time unconditionally, we may end up having
2137          * some other event in the future without PERF_SAMPLE_TIME for good
2138          * reason, i.e. we may not be interested in its timestamps, just in
2139          * it taking place, picking some piece of information when it
2140          * appears in our event stream (vfs_getname comes to mind).
2141          */
2142         if (trace->base_time == 0 && !trace->full_time &&
2143             (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2144                 trace->base_time = sample->time;
2145 }
2146
2147 static int trace__process_sample(struct perf_tool *tool,
2148                                  union perf_event *event,
2149                                  struct perf_sample *sample,
2150                                  struct perf_evsel *evsel,
2151                                  struct machine *machine __maybe_unused)
2152 {
2153         struct trace *trace = container_of(tool, struct trace, tool);
2154         struct thread *thread;
2155         int err = 0;
2156
2157         tracepoint_handler handler = evsel->handler;
2158
2159         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2160         if (thread && thread__is_filtered(thread))
2161                 goto out;
2162
2163         trace__set_base_time(trace, evsel, sample);
2164
2165         if (handler) {
2166                 ++trace->nr_events;
2167                 handler(trace, evsel, event, sample);
2168         }
2169 out:
2170         thread__put(thread);
2171         return err;
2172 }
2173
2174 static int trace__record(struct trace *trace, int argc, const char **argv)
2175 {
2176         unsigned int rec_argc, i, j;
2177         const char **rec_argv;
2178         const char * const record_args[] = {
2179                 "record",
2180                 "-R",
2181                 "-m", "1024",
2182                 "-c", "1",
2183         };
2184
2185         const char * const sc_args[] = { "-e", };
2186         unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2187         const char * const majpf_args[] = { "-e", "major-faults" };
2188         unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2189         const char * const minpf_args[] = { "-e", "minor-faults" };
2190         unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2191
2192         /* +1 is for the event string below */
2193         rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2194                 majpf_args_nr + minpf_args_nr + argc;
2195         rec_argv = calloc(rec_argc + 1, sizeof(char *));
2196
2197         if (rec_argv == NULL)
2198                 return -ENOMEM;
2199
2200         j = 0;
2201         for (i = 0; i < ARRAY_SIZE(record_args); i++)
2202                 rec_argv[j++] = record_args[i];
2203
2204         if (trace->trace_syscalls) {
2205                 for (i = 0; i < sc_args_nr; i++)
2206                         rec_argv[j++] = sc_args[i];
2207
2208                 /* event string may be different for older kernels - e.g., RHEL6 */
2209                 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2210                         rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2211                 else if (is_valid_tracepoint("syscalls:sys_enter"))
2212                         rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2213                 else {
2214                         pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2215                         free(rec_argv);
2216                         return -1;
2217                 }
2218         }
2219
2220         if (trace->trace_pgfaults & TRACE_PFMAJ)
2221                 for (i = 0; i < majpf_args_nr; i++)
2222                         rec_argv[j++] = majpf_args[i];
2223
2224         if (trace->trace_pgfaults & TRACE_PFMIN)
2225                 for (i = 0; i < minpf_args_nr; i++)
2226                         rec_argv[j++] = minpf_args[i];
2227
2228         for (i = 0; i < (unsigned int)argc; i++)
2229                 rec_argv[j++] = argv[i];
2230
2231         return cmd_record(j, rec_argv);
2232 }
2233
2234 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2235
2236 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2237 {
2238         struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2239
2240         if (IS_ERR(evsel))
2241                 return false;
2242
2243         if (perf_evsel__field(evsel, "pathname") == NULL) {
2244                 perf_evsel__delete(evsel);
2245                 return false;
2246         }
2247
2248         evsel->handler = trace__vfs_getname;
2249         perf_evlist__add(evlist, evsel);
2250         return true;
2251 }
2252
2253 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2254 {
2255         struct perf_evsel *evsel;
2256         struct perf_event_attr attr = {
2257                 .type = PERF_TYPE_SOFTWARE,
2258                 .mmap_data = 1,
2259         };
2260
2261         attr.config = config;
2262         attr.sample_period = 1;
2263
2264         event_attr_init(&attr);
2265
2266         evsel = perf_evsel__new(&attr);
2267         if (evsel)
2268                 evsel->handler = trace__pgfault;
2269
2270         return evsel;
2271 }
2272
2273 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2274 {
2275         const u32 type = event->header.type;
2276         struct perf_evsel *evsel;
2277
2278         if (type != PERF_RECORD_SAMPLE) {
2279                 trace__process_event(trace, trace->host, event, sample);
2280                 return;
2281         }
2282
2283         evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2284         if (evsel == NULL) {
2285                 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2286                 return;
2287         }
2288
2289         trace__set_base_time(trace, evsel, sample);
2290
2291         if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2292             sample->raw_data == NULL) {
2293                 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2294                        perf_evsel__name(evsel), sample->tid,
2295                        sample->cpu, sample->raw_size);
2296         } else {
2297                 tracepoint_handler handler = evsel->handler;
2298                 handler(trace, evsel, event, sample);
2299         }
2300 }
2301
/*
 * Create the raw_syscalls:sys_enter/sys_exit tracepoint evsels, resolve
 * the tracepoint fields the fast-path accessors need, configure callchains
 * and add both to the evlist. Returns 0 on success, -1 on failure (with
 * any partially created evsels cleaned up).
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__raw_syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	/* Cache the offset of the "args" payload field for fast access. */
	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__raw_syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	/* Likewise for the "ret" field of sys_exit. */
	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
	perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);

	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit  = sys_exit;

	ret = 0;
out:
	return ret;

out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}
2350
2351 static int trace__set_ev_qualifier_filter(struct trace *trace)
2352 {
2353         int err = -1;
2354         struct perf_evsel *sys_exit;
2355         char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2356                                                 trace->ev_qualifier_ids.nr,
2357                                                 trace->ev_qualifier_ids.entries);
2358
2359         if (filter == NULL)
2360                 goto out_enomem;
2361
2362         if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2363                                           filter)) {
2364                 sys_exit = trace->syscalls.events.sys_exit;
2365                 err = perf_evsel__append_tp_filter(sys_exit, filter);
2366         }
2367
2368         free(filter);
2369 out:
2370         return err;
2371 out_enomem:
2372         errno = ENOMEM;
2373         goto out;
2374 }
2375
2376 static int trace__set_filter_loop_pids(struct trace *trace)
2377 {
2378         unsigned int nr = 1;
2379         pid_t pids[32] = {
2380                 getpid(),
2381         };
2382         struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
2383
2384         while (thread && nr < ARRAY_SIZE(pids)) {
2385                 struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
2386
2387                 if (parent == NULL)
2388                         break;
2389
2390                 if (!strcmp(thread__comm_str(parent), "sshd")) {
2391                         pids[nr++] = parent->tid;
2392                         break;
2393                 }
2394                 thread = parent;
2395         }
2396
2397         return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
2398 }
2399
/*
 * Live tracing mode: set up the requested events (raw syscall
 * tracepoints, vfs_getname, page faults, sched_stat_runtime), optionally
 * fork the workload, open/mmap the events and then loop consuming
 * samples until the workload exits or the user interrupts us, finally
 * printing the summary if requested.
 *
 * Returns 0 on success or a negative error; all error paths funnel
 * through out_delete_evlist so the evlist is always torn down.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;	/* extra argv == workload to fork */
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	/* Major/minor page fault software events, each optional (-F maj/min/all). */
	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	/*
	 * If a global cgroup was set, apply it to all the events without an
	 * explicit cgroup. I.e.:
	 *
	 * 	trace -G A -e sched:*switch
	 *
	 * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
	 * _and_ sched:sched_switch to the 'A' cgroup, while:
	 *
	 * trace -e sched:*switch -G A
	 *
	 * will only set the sched:sched_switch event to the 'A' cgroup, all the
	 * other events (raw_syscalls:sys_{enter,exit}, etc are left "without"
	 * a cgroup (on the root cgroup, sys wide, etc).
	 *
	 * Multiple cgroups:
	 *
	 * trace -G A -e sched:*switch -G B
	 *
	 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
	 * to the 'B' cgroup.
	 *
	 * evlist__set_default_cgroup() grabs a reference of the passed cgroup
	 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
	 */
	if (trace->cgroup)
		evlist__set_default_cgroup(trace->evlist, trace->cgroup);

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, &callchain_param);

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = trace__set_filter_loop_pids(trace);

	if (err < 0)
		goto out_error_mem;

	/* -e syscall list: install the "id in (...)" tracepoint filter. */
	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
	if (err < 0)
		goto out_error_mmap;

	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	/* --delay: start the workload first, enable counting later. */
	if (trace->opts.initial_delay) {
		usleep(trace->opts.initial_delay * 1000);
		perf_evlist__enable(evlist);
	}

	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;

	/*
	 * Now that we already used evsel->attr to ask the kernel to setup the
	 * events, lets reuse evsel->attr.sample_max_stack as the limit in
	 * trace__resolve_callchain(), allowing per-event max-stack settings
	 * to override an explicitely set --max-stack global setting.
	 */
	evlist__for_each_entry(evlist, evsel) {
		if (evsel__has_callchain(evsel) &&
		    evsel->attr.sample_max_stack == 0)
			evsel->attr.sample_max_stack = trace->max_stack;
	}
again:
	before = trace->nr_events;

	/* Drain every mmaped ring buffer, dispatching each sample. */
	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;
		struct perf_mmap *md;

		md = &evlist->mmap[i];
		if (perf_mmap__read_init(md) < 0)
			continue;

		while ((event = perf_mmap__read_event(md)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_mmap__consume(md);

			if (interrupted)
				goto out_disable;

			/* Workload exited: stop producing, keep consuming what's left. */
			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
		perf_mmap__read_done(md);
	}

	/*
	 * Nothing new this pass: poll for more unless we're draining; either
	 * way go around again until the buffers are empty.
	 */
	if (trace->nr_events == before) {
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	trace__symbols__exit(trace);

	perf_evlist__delete(evlist);
	cgroup__put(trace->cgroup);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * Bare braces deliberately scope errbuf to just these error labels,
 * keeping the BUFSIZ buffer off the stack for the common (no error)
 * path; all of them format a message and jump back to out_delete_evlist.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
2676
2677 static int trace__replay(struct trace *trace)
2678 {
2679         const struct perf_evsel_str_handler handlers[] = {
2680                 { "probe:vfs_getname",       trace__vfs_getname, },
2681         };
2682         struct perf_data data = {
2683                 .file      = {
2684                         .path = input_name,
2685                 },
2686                 .mode      = PERF_DATA_MODE_READ,
2687                 .force     = trace->force,
2688         };
2689         struct perf_session *session;
2690         struct perf_evsel *evsel;
2691         int err = -1;
2692
2693         trace->tool.sample        = trace__process_sample;
2694         trace->tool.mmap          = perf_event__process_mmap;
2695         trace->tool.mmap2         = perf_event__process_mmap2;
2696         trace->tool.comm          = perf_event__process_comm;
2697         trace->tool.exit          = perf_event__process_exit;
2698         trace->tool.fork          = perf_event__process_fork;
2699         trace->tool.attr          = perf_event__process_attr;
2700         trace->tool.tracing_data  = perf_event__process_tracing_data;
2701         trace->tool.build_id      = perf_event__process_build_id;
2702         trace->tool.namespaces    = perf_event__process_namespaces;
2703
2704         trace->tool.ordered_events = true;
2705         trace->tool.ordering_requires_timestamps = true;
2706
2707         /* add tid to output */
2708         trace->multiple_threads = true;
2709
2710         session = perf_session__new(&data, false, &trace->tool);
2711         if (session == NULL)
2712                 return -1;
2713
2714         if (trace->opts.target.pid)
2715                 symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
2716
2717         if (trace->opts.target.tid)
2718                 symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
2719
2720         if (symbol__init(&session->header.env) < 0)
2721                 goto out;
2722
2723         trace->host = &session->machines.host;
2724
2725         err = perf_session__set_tracepoints_handlers(session, handlers);
2726         if (err)
2727                 goto out;
2728
2729         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2730                                                      "raw_syscalls:sys_enter");
2731         /* older kernels have syscalls tp versus raw_syscalls */
2732         if (evsel == NULL)
2733                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2734                                                              "syscalls:sys_enter");
2735
2736         if (evsel &&
2737             (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_enter) < 0 ||
2738             perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2739                 pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2740                 goto out;
2741         }
2742
2743         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2744                                                      "raw_syscalls:sys_exit");
2745         if (evsel == NULL)
2746                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2747                                                              "syscalls:sys_exit");
2748         if (evsel &&
2749             (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_exit) < 0 ||
2750             perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2751                 pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2752                 goto out;
2753         }
2754
2755         evlist__for_each_entry(session->evlist, evsel) {
2756                 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2757                     (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2758                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2759                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2760                         evsel->handler = trace__pgfault;
2761         }
2762
2763         setup_pager();
2764
2765         err = perf_session__process_events(session);
2766         if (err)
2767                 pr_err("Failed to process events, error %d", err);
2768
2769         else if (trace->summary)
2770                 trace__fprintf_thread_summary(trace, trace->output);
2771
2772 out:
2773         perf_session__delete(session);
2774
2775         return err;
2776 }
2777
/* Print the banner preceding the per-thread summary and return its length. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2786
/*
 * Re-sort a thread's per-syscall stats (an intlist keyed by syscall id)
 * by total time spent, descending.  The comparison expression and the
 * entry-filling body below are stitched together by DEFINE_RESORT_RB()
 * from rb_resort.h.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* total time = call count * average duration, converted to msecs */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2800
/*
 * Print one thread's per-syscall statistics table (calls, total, min,
 * avg, max, stddev), sorted by total time spent via the syscall_stats
 * resort rbtree above.  Returns the number of characters printed.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	/* declares and fills 'syscall_stats', freed with resort_rb__delete() */
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	/* 'syscall_stats_entry' is declared by the resort_rb iteration macro */
	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* stats are accumulated in nsecs; table shows msecs */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* stddev as a percentage of the average */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2843
2844 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2845 {
2846         size_t printed = 0;
2847         struct thread_trace *ttrace = thread__priv(thread);
2848         double ratio;
2849
2850         if (ttrace == NULL)
2851                 return 0;
2852
2853         ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2854
2855         printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2856         printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2857         printed += fprintf(fp, "%.1f%%", ratio);
2858         if (ttrace->pfmaj)
2859                 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2860         if (ttrace->pfmin)
2861                 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2862         if (trace->sched)
2863                 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2864         else if (fputc('\n', fp) != EOF)
2865                 ++printed;
2866
2867         printed += thread__dump_stats(ttrace, trace, fp);
2868
2869         return printed;
2870 }
2871
2872 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2873 {
2874         return ttrace ? ttrace->nr_events : 0;
2875 }
2876
/*
 * Re-sort the machine's threads rbtree ascending by event count, so the
 * summary prints the busiest threads last (nearest the prompt).
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2883
/*
 * Print the end-of-run per-thread summary: header, then every thread in
 * each hash bucket of the host machine, sorted by event count via the
 * 'threads' resort rbtree above.  Returns characters printed, or 0 if
 * sorting failed.
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;
	int i;

	/* machine threads are kept in THREADS__TABLE_SIZE hashed rbtrees */
	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
		DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);

		if (threads == NULL) {
			fprintf(fp, "%s", "Error sorting output by nr_events!\n");
			return 0;
		}

		/* 'threads_entry' is declared by the resort_rb iteration macro */
		resort_rb__for_each_entry(nd, threads)
			printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

		resort_rb__delete(threads);
	}
	return printed;
}
2905
2906 static int trace__set_duration(const struct option *opt, const char *str,
2907                                int unset __maybe_unused)
2908 {
2909         struct trace *trace = opt->value;
2910
2911         trace->duration_filter = atof(str);
2912         return 0;
2913 }
2914
2915 static int trace__set_filter_pids(const struct option *opt, const char *str,
2916                                   int unset __maybe_unused)
2917 {
2918         int ret = -1;
2919         size_t i;
2920         struct trace *trace = opt->value;
2921         /*
2922          * FIXME: introduce a intarray class, plain parse csv and create a
2923          * { int nr, int entries[] } struct...
2924          */
2925         struct intlist *list = intlist__new(str);
2926
2927         if (list == NULL)
2928                 return -1;
2929
2930         i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2931         trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2932
2933         if (trace->filter_pids.entries == NULL)
2934                 goto out;
2935
2936         trace->filter_pids.entries[0] = getpid();
2937
2938         for (i = 1; i < trace->filter_pids.nr; ++i)
2939                 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2940
2941         intlist__delete(list);
2942         ret = 0;
2943 out:
2944         return ret;
2945 }
2946
2947 static int trace__open_output(struct trace *trace, const char *filename)
2948 {
2949         struct stat st;
2950
2951         if (!stat(filename, &st) && st.st_size) {
2952                 char oldname[PATH_MAX];
2953
2954                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2955                 unlink(oldname);
2956                 rename(filename, oldname);
2957         }
2958
2959         trace->output = fopen(filename, "w");
2960
2961         return trace->output == NULL ? -errno : 0;
2962 }
2963
2964 static int parse_pagefaults(const struct option *opt, const char *str,
2965                             int unset __maybe_unused)
2966 {
2967         int *trace_pgfaults = opt->value;
2968
2969         if (strcmp(str, "all") == 0)
2970                 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2971         else if (strcmp(str, "maj") == 0)
2972                 *trace_pgfaults |= TRACE_PFMAJ;
2973         else if (strcmp(str, "min") == 0)
2974                 *trace_pgfaults |= TRACE_PFMIN;
2975         else
2976                 return -1;
2977
2978         return 0;
2979 }
2980
2981 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2982 {
2983         struct perf_evsel *evsel;
2984
2985         evlist__for_each_entry(evlist, evsel)
2986                 evsel->handler = handler;
2987 }
2988
/*
 * For every "syscalls:" tracepoint evsel that hasn't been set up yet,
 * initialize its syscall_tp private area and wire up the field right
 * after the common id (either the args pointer array for sys_enter_* or
 * the return value for sys_exit_*).  Returns 0 on success, -1 on error.
 */
static int evlist__set_syscall_tp_fields(struct perf_evlist *evlist)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel) {
		/* skip already-initialized evsels and non-tracepoints */
		if (evsel->priv || !evsel->tp_format)
			continue;

		if (strcmp(evsel->tp_format->system, "syscalls"))
			continue;

		if (perf_evsel__init_syscall_tp(evsel))
			return -1;

		if (!strncmp(evsel->tp_format->name, "sys_enter_", 10)) {
			struct syscall_tp *sc = evsel->priv;

			/* args start one u64 (the syscall id) past sc->id */
			if (__tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64)))
				return -1;
		} else if (!strncmp(evsel->tp_format->name, "sys_exit_", 9)) {
			struct syscall_tp *sc = evsel->priv;

			/* likewise, the return value follows the syscall id */
			if (__tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap))
				return -1;
		}
	}

	return 0;
}
3018
3019 /*
3020  * XXX: Hackish, just splitting the combined -e+--event (syscalls
3021  * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
3022  * existing facilities unchanged (trace->ev_qualifier + parse_options()).
3023  *
3024  * It'd be better to introduce a parse_options() variant that would return a
3025  * list with the terms it didn't match to an event...
3026  */
3027 static int trace__parse_events_option(const struct option *opt, const char *str,
3028                                       int unset __maybe_unused)
3029 {
3030         struct trace *trace = (struct trace *)opt->value;
3031         const char *s = str;
3032         char *sep = NULL, *lists[2] = { NULL, NULL, };
3033         int len = strlen(str) + 1, err = -1, list, idx;
3034         char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
3035         char group_name[PATH_MAX];
3036
3037         if (strace_groups_dir == NULL)
3038                 return -1;
3039
3040         if (*s == '!') {
3041                 ++s;
3042                 trace->not_ev_qualifier = true;
3043         }
3044
3045         while (1) {
3046                 if ((sep = strchr(s, ',')) != NULL)
3047                         *sep = '\0';
3048
3049                 list = 0;
3050                 if (syscalltbl__id(trace->sctbl, s) >= 0 ||
3051                     syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
3052                         list = 1;
3053                 } else {
3054                         path__join(group_name, sizeof(group_name), strace_groups_dir, s);
3055                         if (access(group_name, R_OK) == 0)
3056                                 list = 1;
3057                 }
3058
3059                 if (lists[list]) {
3060                         sprintf(lists[list] + strlen(lists[list]), ",%s", s);
3061                 } else {
3062                         lists[list] = malloc(len);
3063                         if (lists[list] == NULL)
3064                                 goto out;
3065                         strcpy(lists[list], s);
3066                 }
3067
3068                 if (!sep)
3069                         break;
3070
3071                 *sep = ',';
3072                 s = sep + 1;
3073         }
3074
3075         if (lists[1] != NULL) {
3076                 struct strlist_config slist_config = {
3077                         .dirname = strace_groups_dir,
3078                 };
3079
3080                 trace->ev_qualifier = strlist__new(lists[1], &slist_config);
3081                 if (trace->ev_qualifier == NULL) {
3082                         fputs("Not enough memory to parse event qualifier", trace->output);
3083                         goto out;
3084                 }
3085
3086                 if (trace__validate_ev_qualifier(trace))
3087                         goto out;
3088                 trace->trace_syscalls = true;
3089         }
3090
3091         err = 0;
3092
3093         if (lists[0]) {
3094                 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
3095                                                "event selector. use 'perf list' to list available events",
3096                                                parse_events_option);
3097                 err = parse_events_option(&o, lists[0], 0);
3098         }
3099 out:
3100         if (sep)
3101                 *sep = ',';
3102
3103         return err;
3104 }
3105
3106 static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
3107 {
3108         struct trace *trace = opt->value;
3109
3110         if (!list_empty(&trace->evlist->entries))
3111                 return parse_cgroups(opt, str, unset);
3112
3113         trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
3114
3115         return 0;
3116 }
3117
3118 int cmd_trace(int argc, const char **argv)
3119 {
3120         const char *trace_usage[] = {
3121                 "perf trace [<options>] [<command>]",
3122                 "perf trace [<options>] -- <command> [<options>]",
3123                 "perf trace record [<options>] [<command>]",
3124                 "perf trace record [<options>] -- <command> [<options>]",
3125                 NULL
3126         };
3127         struct trace trace = {
3128                 .syscalls = {
3129                         . max = -1,
3130                 },
3131                 .opts = {
3132                         .target = {
3133                                 .uid       = UINT_MAX,
3134                                 .uses_mmap = true,
3135                         },
		.user_freq     = UINT_MAX,
		.user_interval = ULLONG_MAX,
		.no_buffering  = true,
		.mmap_pages    = UINT_MAX,	/* UINT_MAX == "not set by the user" sentinel, resolved below */
		.proc_map_timeout  = 500,	/* per-thread /proc/PID/maps processing timeout, in ms */
	},
	.output = stderr,		/* may be replaced via -o/--output below */
	.show_comm = true,
	.trace_syscalls = false,
	.kernel_syscallchains = false,
	.max_stack = UINT_MAX,		/* UINT_MAX == "not set by the user" sentinel, resolved below */
	};
	const char *output_name = NULL;
	/* Option table; most entries just fill in fields of 'trace' directly. */
	const struct option trace_options[] = {
	OPT_CALLBACK('e', "event", &trace, "event",
		     "event/syscall selector. use 'perf list' to list available events",
		     trace__parse_events_option),
	OPT_BOOLEAN(0, "comm", &trace.show_comm,
		    "show the thread COMM next to its id"),
	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
		     trace__parse_events_option),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
		    "trace events on existing process id"),
	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
		    "trace events on existing thread id"),
	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
		     "pids to filter (by the kernel)", trace__set_filter_pids),
	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
		     "number of mmap data pages",
		     perf_evlist__parse_mmap_pages),
	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
		   "user to profile"),
	OPT_CALLBACK(0, "duration", &trace, "float",
		     "show only events with duration > N.M ms",
		     trace__set_duration),
	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
	OPT_BOOLEAN('T', "time", &trace.full_time,
		    "Show full timestamp, not time relative to first start"),
	OPT_BOOLEAN(0, "failure", &trace.failure_only,
		    "Show only syscalls that failed"),
	OPT_BOOLEAN('s', "summary", &trace.summary_only,
		    "Show only syscall summary with statistics"),
	OPT_BOOLEAN('S', "with-summary", &trace.summary,
		    "Show all syscalls and summary with statistics"),
	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
		     "Trace pagefaults", parse_pagefaults, "maj"),
	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
	OPT_CALLBACK(0, "call-graph", &trace.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
		    "Show the kernel callchains on the syscall exit path"),
	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
		     "Set the minimum stack depth when parsing the callchain, "
		     "anything below the specified depth will be ignored."),
	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
		     "Set the maximum stack depth when parsing the callchain, "
		     "anything beyond the specified depth will be ignored. "
		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
	OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
			"print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
		     trace__parse_cgroups),
	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
		     "ms to wait before starting measurement after program "
		     "start"),
	OPT_END()
	};
	/* Assume the user set these until the UINT_MAX sentinels say otherwise. */
	bool __maybe_unused max_stack_user_set = true;
	bool mmap_pages_user_set = true;
	struct perf_evsel *evsel;
	const char * const trace_subcommands[] = { "record", NULL };
	int err = -1;
	char bf[BUFSIZ];	/* scratch buffer for strerror-style messages */

	/* Dump a stack trace on hard failures so bug reports are actionable. */
	signal(SIGSEGV, sighandler_dump_stack);
	signal(SIGFPE, sighandler_dump_stack);

	trace.evlist = perf_evlist__new();
	trace.sctbl = syscalltbl__new();

	if (trace.evlist == NULL || trace.sctbl == NULL) {
		pr_err("Not enough memory to run!\n");
		err = -ENOMEM;
		goto out;
	}

	/* Stop at the first non-option so a trailing workload command survives. */
	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

	/* Cgroup filtering only makes sense when monitoring the whole system. */
	if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
		usage_with_options_msg(trace_usage, trace_options,
				       "cgroup monitoring only available in system-wide mode");
	}

	/*
	 * Hook up the "__augmented_syscalls__" BPF output event, if a loaded
	 * BPF object provides one; returns an ERR_PTR-encoded errno on failure.
	 */
	evsel = bpf__setup_output_event(trace.evlist, "__augmented_syscalls__");
	if (IS_ERR(evsel)) {
		bpf__strerror_setup_output_event(trace.evlist, PTR_ERR(evsel), bf, sizeof(bf));
		pr_err("ERROR: Setup trace syscalls enter failed: %s\n", bf);
		goto out;
	}

	err = bpf__setup_stdout(trace.evlist);
	if (err) {
		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
		goto out;
	}

	/* From here on, bare "goto out" paths report failure via err == -1. */
	err = -1;

	if (trace.trace_pgfaults) {
		/* Page fault samples need the faulting address and a timestamp. */
		trace.opts.sample_address = true;
		trace.opts.sample_time = true;
	}

	if (trace.opts.mmap_pages == UINT_MAX)
		mmap_pages_user_set = false;

	if (trace.max_stack == UINT_MAX) {
		/* Replaying a file: use the compiled-in max; live: ask the kernel. */
		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack();
		max_stack_user_set = false;
	}

#ifdef HAVE_DWARF_UNWIND_SUPPORT
	/* Stack-depth options imply callchains: default to DWARF unwinding. */
	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
	}
#endif

	if (callchain_param.enabled) {
		/*
		 * Callchains inflate sample size; if root and the user didn't
		 * pick a ring buffer size, use 4x the mlock'able default.
		 */
		if (!mmap_pages_user_set && geteuid() == 0)
			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

		symbol_conf.use_callchain = true;
	}

	/* Wire up handlers for events added via -e/--event/--expr. */
	if (trace.evlist->nr_entries > 0) {
		evlist__set_evsel_handler(trace.evlist, trace__event_handler);
		if (evlist__set_syscall_tp_fields(trace.evlist)) {
			perror("failed to set syscalls:* tracepoint fields");
			goto out;
		}
	}

	/* "perf trace record ..." delegates to perf record with trace events. */
	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	/* Nothing selected at all: default to strace-like syscall tracing. */
	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
		trace.trace_syscalls = true;
	}

	if (output_name != NULL) {
		/* Redirect trace.output from stderr to the -o file. */
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	/* No workload and no target specified: trace the whole system. */
	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	if (input_name)
		err = trace__replay(&trace);	/* analyze a perf.data file (-i) */
	else
		err = trace__run(&trace, argc, argv);	/* live tracing */

out_close:
	/* trace.output is stderr unless -o reopened it; only close our own file. */
	if (output_name != NULL)
		fclose(trace.output);
out:
	return err;
}