Merge tag 'docs/v5.3-1' of git://git.kernel.org/pub/scm/linux/kernel/git/mchehab...
[linux-2.6-microblaze.git] / tools / perf / builtin-trace.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * builtin-trace.c
4  *
5  * Builtin 'trace' command:
6  *
7  * Display a continuously updated trace of any workload, CPU, specific PID,
8  * system wide, etc.  Default format is loosely strace like, but any other
9  * event may be specified using --event.
10  *
11  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
12  *
13  * Initially based on the 'trace' prototype by Thomas Gleixner:
14  *
15  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
16  */
17
18 #include <traceevent/event-parse.h>
19 #include <api/fs/tracing_path.h>
20 #include <bpf/bpf.h>
21 #include "util/bpf_map.h"
22 #include "builtin.h"
23 #include "util/cgroup.h"
24 #include "util/color.h"
25 #include "util/config.h"
26 #include "util/debug.h"
27 #include "util/env.h"
28 #include "util/event.h"
29 #include "util/evlist.h"
30 #include <subcmd/exec-cmd.h>
31 #include "util/machine.h"
32 #include "util/map.h"
33 #include "util/symbol.h"
34 #include "util/path.h"
35 #include "util/session.h"
36 #include "util/thread.h"
37 #include <subcmd/parse-options.h>
38 #include "util/strlist.h"
39 #include "util/intlist.h"
40 #include "util/thread_map.h"
41 #include "util/stat.h"
42 #include "trace/beauty/beauty.h"
43 #include "trace-event.h"
44 #include "util/parse-events.h"
45 #include "util/bpf-loader.h"
46 #include "callchain.h"
47 #include "print_binary.h"
48 #include "string2.h"
49 #include "syscalltbl.h"
50 #include "rb_resort.h"
51
52 #include <errno.h>
53 #include <inttypes.h>
54 #include <poll.h>
55 #include <signal.h>
56 #include <stdlib.h>
57 #include <string.h>
58 #include <linux/err.h>
59 #include <linux/filter.h>
60 #include <linux/kernel.h>
61 #include <linux/random.h>
62 #include <linux/stringify.h>
63 #include <linux/time64.h>
64 #include <linux/zalloc.h>
65 #include <fcntl.h>
66 #include <sys/sysmacros.h>
67
68 #include <linux/ctype.h>
69
/*
 * Fallback values, defined here only when the headers included above
 * did not already provide them.
 */
#if !defined(O_CLOEXEC)
# define O_CLOEXEC              02000000
#endif

#if !defined(F_LINUX_SPECIFIC_BASE)
# define F_LINUX_SPECIFIC_BASE  1024
#endif
77
78 struct trace {
79         struct perf_tool        tool;
80         struct syscalltbl       *sctbl;
81         struct {
82                 int             max;
83                 struct syscall  *table;
84                 struct bpf_map  *map;
85                 struct {
86                         struct perf_evsel *sys_enter,
87                                           *sys_exit,
88                                           *augmented;
89                 }               events;
90         } syscalls;
91         struct {
92                 struct bpf_map *map;
93         } dump;
94         struct record_opts      opts;
95         struct perf_evlist      *evlist;
96         struct machine          *host;
97         struct thread           *current;
98         struct cgroup           *cgroup;
99         u64                     base_time;
100         FILE                    *output;
101         unsigned long           nr_events;
102         unsigned long           nr_events_printed;
103         unsigned long           max_events;
104         struct strlist          *ev_qualifier;
105         struct {
106                 size_t          nr;
107                 int             *entries;
108         }                       ev_qualifier_ids;
109         struct {
110                 size_t          nr;
111                 pid_t           *entries;
112                 struct bpf_map  *map;
113         }                       filter_pids;
114         double                  duration_filter;
115         double                  runtime_ms;
116         struct {
117                 u64             vfs_getname,
118                                 proc_getname;
119         } stats;
120         unsigned int            max_stack;
121         unsigned int            min_stack;
122         int                     raw_augmented_syscalls_args_size;
123         bool                    raw_augmented_syscalls;
124         bool                    sort_events;
125         bool                    not_ev_qualifier;
126         bool                    live;
127         bool                    full_time;
128         bool                    sched;
129         bool                    multiple_threads;
130         bool                    summary;
131         bool                    summary_only;
132         bool                    failure_only;
133         bool                    show_comm;
134         bool                    print_sample;
135         bool                    show_tool_stats;
136         bool                    trace_syscalls;
137         bool                    kernel_syscallchains;
138         s16                     args_alignment;
139         bool                    show_tstamp;
140         bool                    show_duration;
141         bool                    show_zeros;
142         bool                    show_arg_names;
143         bool                    show_string_prefix;
144         bool                    force;
145         bool                    vfs_getname;
146         int                     trace_pgfaults;
147         struct {
148                 struct ordered_events   data;
149                 u64                     last;
150         } oe;
151 };
152
153 struct tp_field {
154         int offset;
155         union {
156                 u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
157                 void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
158         };
159 };
160
161 #define TP_UINT_FIELD(bits) \
162 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
163 { \
164         u##bits value; \
165         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
166         return value;  \
167 }
168
169 TP_UINT_FIELD(8);
170 TP_UINT_FIELD(16);
171 TP_UINT_FIELD(32);
172 TP_UINT_FIELD(64);
173
174 #define TP_UINT_FIELD__SWAPPED(bits) \
175 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
176 { \
177         u##bits value; \
178         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
179         return bswap_##bits(value);\
180 }
181
182 TP_UINT_FIELD__SWAPPED(16);
183 TP_UINT_FIELD__SWAPPED(32);
184 TP_UINT_FIELD__SWAPPED(64);
185
186 static int __tp_field__init_uint(struct tp_field *field, int size, int offset, bool needs_swap)
187 {
188         field->offset = offset;
189
190         switch (size) {
191         case 1:
192                 field->integer = tp_field__u8;
193                 break;
194         case 2:
195                 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
196                 break;
197         case 4:
198                 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
199                 break;
200         case 8:
201                 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
202                 break;
203         default:
204                 return -1;
205         }
206
207         return 0;
208 }
209
210 static int tp_field__init_uint(struct tp_field *field, struct tep_format_field *format_field, bool needs_swap)
211 {
212         return __tp_field__init_uint(field, format_field->size, format_field->offset, needs_swap);
213 }
214
215 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
216 {
217         return sample->raw_data + field->offset;
218 }
219
220 static int __tp_field__init_ptr(struct tp_field *field, int offset)
221 {
222         field->offset = offset;
223         field->pointer = tp_field__ptr;
224         return 0;
225 }
226
227 static int tp_field__init_ptr(struct tp_field *field, struct tep_format_field *format_field)
228 {
229         return __tp_field__init_ptr(field, format_field->offset);
230 }
231
232 struct syscall_tp {
233         struct tp_field id;
234         union {
235                 struct tp_field args, ret;
236         };
237 };
238
239 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
240                                           struct tp_field *field,
241                                           const char *name)
242 {
243         struct tep_format_field *format_field = perf_evsel__field(evsel, name);
244
245         if (format_field == NULL)
246                 return -1;
247
248         return tp_field__init_uint(field, format_field, evsel->needs_swap);
249 }
250
/*
 * Initialize member 'name' of the struct syscall_tp hanging off evsel->priv
 * as an integer accessor for the tracepoint field with that same name.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
        ({ \
                struct syscall_tp *sc = (evsel)->priv; \
                perf_evsel__init_tp_uint_field((evsel), &sc->name, #name); \
        })
254
/*
 * Look up the tracepoint field called @name in @evsel and set up @field as
 * a pointer accessor for it.
 *
 * Returns -1 when the field does not exist, otherwise the result of
 * tp_field__init_ptr().
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
                                         struct tp_field *field,
                                         const char *name)
{
        struct tep_format_field *fmt = perf_evsel__field(evsel, name);

        if (!fmt)
                return -1;

        return tp_field__init_ptr(field, fmt);
}
266
/*
 * Initialize member 'name' of the struct syscall_tp hanging off evsel->priv
 * as a pointer accessor for the tracepoint field with that same name.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
        ({ \
                struct syscall_tp *sc = (evsel)->priv; \
                perf_evsel__init_tp_ptr_field((evsel), &sc->name, #name); \
        })
270
/*
 * Release the private data attached to @evsel (evsel->priv) and then delete
 * the evsel itself.
 */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
        zfree(&evsel->priv);
        perf_evsel__delete(evsel);
}
276
277 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel)
278 {
279         struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));
280
281         if (evsel->priv != NULL) {
282                 if (perf_evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr") &&
283                     perf_evsel__init_tp_uint_field(evsel, &sc->id, "nr"))
284                         goto out_delete;
285                 return 0;
286         }
287
288         return -ENOMEM;
289 out_delete:
290         zfree(&evsel->priv);
291         return -ENOENT;
292 }
293
294 static int perf_evsel__init_augmented_syscall_tp(struct perf_evsel *evsel, struct perf_evsel *tp)
295 {
296         struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));
297
298         if (evsel->priv != NULL) {
299                 struct tep_format_field *syscall_id = perf_evsel__field(tp, "id");
300                 if (syscall_id == NULL)
301                         syscall_id = perf_evsel__field(tp, "__syscall_nr");
302                 if (syscall_id == NULL)
303                         goto out_delete;
304                 if (__tp_field__init_uint(&sc->id, syscall_id->size, syscall_id->offset, evsel->needs_swap))
305                         goto out_delete;
306
307                 return 0;
308         }
309
310         return -ENOMEM;
311 out_delete:
312         zfree(&evsel->priv);
313         return -EINVAL;
314 }
315
316 static int perf_evsel__init_augmented_syscall_tp_args(struct perf_evsel *evsel)
317 {
318         struct syscall_tp *sc = evsel->priv;
319
320         return __tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64));
321 }
322
323 static int perf_evsel__init_augmented_syscall_tp_ret(struct perf_evsel *evsel)
324 {
325         struct syscall_tp *sc = evsel->priv;
326
327         return __tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap);
328 }
329
330 static int perf_evsel__init_raw_syscall_tp(struct perf_evsel *evsel, void *handler)
331 {
332         evsel->priv = malloc(sizeof(struct syscall_tp));
333         if (evsel->priv != NULL) {
334                 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
335                         goto out_delete;
336
337                 evsel->handler = handler;
338                 return 0;
339         }
340
341         return -ENOMEM;
342
343 out_delete:
344         zfree(&evsel->priv);
345         return -ENOENT;
346 }
347
/*
 * Create the evsel for the "raw_syscalls:<direction>" tracepoint and attach
 * @handler to it.
 *
 * Returns the new evsel, or NULL if the tracepoint could not be created or
 * its syscall_tp could not be initialized (the evsel is deleted then).
 */
static struct perf_evsel *perf_evsel__raw_syscall_newtp(const char *direction, void *handler)
{
        struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

        if (IS_ERR(evsel)) {
                /* older kernels (e.g., RHEL6) use syscalls:{enter,exit} */
                evsel = perf_evsel__newtp("syscalls", direction);
                if (IS_ERR(evsel))
                        return NULL;
        }

        if (perf_evsel__init_raw_syscall_tp(evsel, handler) != 0) {
                perf_evsel__delete_priv(evsel);
                return NULL;
        }

        return evsel;
}
368
/*
 * Read member 'name' of the evsel's struct syscall_tp from 'sample' through
 * its integer accessor.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
        ({ \
                struct syscall_tp *fields = (evsel)->priv; \
                fields->name.integer(&fields->name, sample); \
        })

/*
 * Same as above, but through the pointer accessor.
 */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
        ({ \
                struct syscall_tp *fields = (evsel)->priv; \
                fields->name.pointer(&fields->name, sample); \
        })
376
377 size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, bool show_prefix, int val)
378 {
379         int idx = val - sa->offset;
380
381         if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL) {
382                 size_t printed = scnprintf(bf, size, intfmt, val);
383                 if (show_prefix)
384                         printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sa->prefix);
385                 return printed;
386         }
387
388         return scnprintf(bf, size, "%s%s", show_prefix ? sa->prefix : "", sa->entries[idx]);
389 }
390
391 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
392                                                 const char *intfmt,
393                                                 struct syscall_arg *arg)
394 {
395         return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->show_string_prefix, arg->val);
396 }
397
/* strarray formatter that prints values without a name as "%d". */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
                                              struct syscall_arg *arg)
{
        const char *intfmt = "%d";

        return __syscall_arg__scnprintf_strarray(bf, size, intfmt, arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
405
406 size_t syscall_arg__scnprintf_strarray_flags(char *bf, size_t size, struct syscall_arg *arg)
407 {
408         return strarray__scnprintf_flags(arg->parm, bf, size, arg->show_string_prefix, arg->val);
409 }
410
411 size_t strarrays__scnprintf(struct strarrays *sas, char *bf, size_t size, const char *intfmt, bool show_prefix, int val)
412 {
413         size_t printed;
414         int i;
415
416         for (i = 0; i < sas->nr_entries; ++i) {
417                 struct strarray *sa = sas->entries[i];
418                 int idx = val - sa->offset;
419
420                 if (idx >= 0 && idx < sa->nr_entries) {
421                         if (sa->entries[idx] == NULL)
422                                 break;
423                         return scnprintf(bf, size, "%s%s", show_prefix ? sa->prefix : "", sa->entries[idx]);
424                 }
425         }
426
427         printed = scnprintf(bf, size, intfmt, val);
428         if (show_prefix)
429                 printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sas->entries[0]->prefix);
430         return printed;
431 }
432
433 size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
434                                         struct syscall_arg *arg)
435 {
436         return strarrays__scnprintf(arg->parm, bf, size, "%d", arg->show_string_prefix, arg->val);
437 }
438
/* Fallback, used only when the included headers did not define it. */
#if !defined(AT_FDCWD)
#define AT_FDCWD        -100
#endif
442
443 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
444                                            struct syscall_arg *arg)
445 {
446         int fd = arg->val;
447         const char *prefix = "AT_FD";
448
449         if (fd == AT_FDCWD)
450                 return scnprintf(bf, size, "%s%s", arg->show_string_prefix ? prefix : "", "CWD");
451
452         return syscall_arg__scnprintf_fd(bf, size, arg);
453 }
454
455 #define SCA_FDAT syscall_arg__scnprintf_fd_at
456
/* Forward declaration; the definition is not in this part of the file. */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size, struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
461
/* Format arg->val as hexadecimal with a "0x" prefix ("%#lx"). */
size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
        return scnprintf(bf, size, "%#lx", arg->val);
}
466
467 size_t syscall_arg__scnprintf_ptr(char *bf, size_t size, struct syscall_arg *arg)
468 {
469         if (arg->val == 0)
470                 return scnprintf(bf, size, "NULL");
471         return syscall_arg__scnprintf_hex(bf, size, arg);
472 }
473
474 size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
475 {
476         return scnprintf(bf, size, "%d", arg->val);
477 }
478
/* Format arg->val as a signed decimal long ("%ld"). */
size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
        return scnprintf(bf, size, "%ld", arg->val);
}
483
484 static const char *bpf_cmd[] = {
485         "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
486         "MAP_GET_NEXT_KEY", "PROG_LOAD",
487 };
488 static DEFINE_STRARRAY(bpf_cmd, "BPF_");
489
490 static const char *fsmount_flags[] = {
491         [1] = "CLOEXEC",
492 };
493 static DEFINE_STRARRAY(fsmount_flags, "FSMOUNT_");
494
495 #include "trace/beauty/generated/fsconfig_arrays.c"
496
497 static DEFINE_STRARRAY(fsconfig_cmds, "FSCONFIG_");
498
499 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
500 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, "EPOLL_CTL_", 1);
501
502 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
503 static DEFINE_STRARRAY(itimers, "ITIMER_");
504
505 static const char *keyctl_options[] = {
506         "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
507         "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
508         "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
509         "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
510         "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
511 };
512 static DEFINE_STRARRAY(keyctl_options, "KEYCTL_");
513
514 static const char *whences[] = { "SET", "CUR", "END",
515 #ifdef SEEK_DATA
516 "DATA",
517 #endif
518 #ifdef SEEK_HOLE
519 "HOLE",
520 #endif
521 };
522 static DEFINE_STRARRAY(whences, "SEEK_");
523
524 static const char *fcntl_cmds[] = {
525         "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
526         "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
527         "SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
528         "GETOWNER_UIDS",
529 };
530 static DEFINE_STRARRAY(fcntl_cmds, "F_");
531
532 static const char *fcntl_linux_specific_cmds[] = {
533         "SETLEASE", "GETLEASE", "NOTIFY", [5] = "CANCELLK", "DUPFD_CLOEXEC",
534         "SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
535         "GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
536 };
537
538 static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, "F_", F_LINUX_SPECIFIC_BASE);
539
540 static struct strarray *fcntl_cmds_arrays[] = {
541         &strarray__fcntl_cmds,
542         &strarray__fcntl_linux_specific_cmds,
543 };
544
545 static DEFINE_STRARRAYS(fcntl_cmds_arrays);
546
547 static const char *rlimit_resources[] = {
548         "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
549         "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
550         "RTTIME",
551 };
552 static DEFINE_STRARRAY(rlimit_resources, "RLIMIT_");
553
554 static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
555 static DEFINE_STRARRAY(sighow, "SIG_");
556
557 static const char *clockid[] = {
558         "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
559         "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
560         "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
561 };
562 static DEFINE_STRARRAY(clockid, "CLOCK_");
563
564 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
565                                                  struct syscall_arg *arg)
566 {
567         bool show_prefix = arg->show_string_prefix;
568         const char *suffix = "_OK";
569         size_t printed = 0;
570         int mode = arg->val;
571
572         if (mode == F_OK) /* 0 */
573                 return scnprintf(bf, size, "F%s", show_prefix ? suffix : "");
574 #define P_MODE(n) \
575         if (mode & n##_OK) { \
576                 printed += scnprintf(bf + printed, size - printed, "%s%s", #n, show_prefix ? suffix : ""); \
577                 mode &= ~n##_OK; \
578         }
579
580         P_MODE(R);
581         P_MODE(W);
582         P_MODE(X);
583 #undef P_MODE
584
585         if (mode)
586                 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
587
588         return printed;
589 }
590
591 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
592
/* Forward declaration; the definition is not in this part of the file. */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size, struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename
597
598 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
599                                                 struct syscall_arg *arg)
600 {
601         bool show_prefix = arg->show_string_prefix;
602         const char *prefix = "O_";
603         int printed = 0, flags = arg->val;
604
605 #define P_FLAG(n) \
606         if (flags & O_##n) { \
607                 printed += scnprintf(bf + printed, size - printed, "%s%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \
608                 flags &= ~O_##n; \
609         }
610
611         P_FLAG(CLOEXEC);
612         P_FLAG(NONBLOCK);
613 #undef P_FLAG
614
615         if (flags)
616                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
617
618         return printed;
619 }
620
621 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
622
/* Fallbacks, used only when the included headers did not define them. */
#if !defined(GRND_NONBLOCK)
#define GRND_NONBLOCK   0x0001
#endif
#if !defined(GRND_RANDOM)
#define GRND_RANDOM     0x0002
#endif
629
630 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
631                                                    struct syscall_arg *arg)
632 {
633         bool show_prefix = arg->show_string_prefix;
634         const char *prefix = "GRND_";
635         int printed = 0, flags = arg->val;
636
637 #define P_FLAG(n) \
638         if (flags & GRND_##n) { \
639                 printed += scnprintf(bf + printed, size - printed, "%s%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \
640                 flags &= ~GRND_##n; \
641         }
642
643         P_FLAG(RANDOM);
644         P_FLAG(NONBLOCK);
645 #undef P_FLAG
646
647         if (flags)
648                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
649
650         return printed;
651 }
652
653 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
654
/*
 * Initializers for a syscall argument formatted through the strarray
 * strarray__<array>.  The 'name' parameter is not used in the expansion.
 */
#define STRARRAY(name, array) { \
        .scnprintf  = SCA_STRARRAY, \
        .parm       = &strarray__##array, \
}

#define STRARRAY_FLAGS(name, array) { \
        .scnprintf  = SCA_STRARRAY_FLAGS, \
        .parm       = &strarray__##array, \
}
662
663 #include "trace/beauty/arch_errno_names.c"
664 #include "trace/beauty/eventfd.c"
665 #include "trace/beauty/futex_op.c"
666 #include "trace/beauty/futex_val3.c"
667 #include "trace/beauty/mmap.c"
668 #include "trace/beauty/mode_t.c"
669 #include "trace/beauty/msg_flags.c"
670 #include "trace/beauty/open_flags.c"
671 #include "trace/beauty/perf_event_open.c"
672 #include "trace/beauty/pid.c"
673 #include "trace/beauty/sched_policy.c"
674 #include "trace/beauty/seccomp.c"
675 #include "trace/beauty/signum.c"
676 #include "trace/beauty/socket_type.c"
677 #include "trace/beauty/waitid_options.c"
678
/*
 * Describes how one syscall argument is formatted.
 *
 * @scnprintf: formatter writing the argument into a buffer
 * @mask_val:  optional callback taking the argument and its value and
 *             returning an unsigned long
 * @parm:      opaque data for the formatter (e.g. a strarray, see STRARRAY)
 * @name:      argument name
 * @show_zero: NOTE(review): presumably forces printing of a zero value -
 *             confirm against the users of this flag
 */
struct syscall_arg_fmt {
        size_t        (*scnprintf)(char *bf, size_t size,
                                   struct syscall_arg *arg);
        unsigned long (*mask_val)(struct syscall_arg *arg,
                                  unsigned long val);
        void          *parm;
        const char    *name;
        bool          show_zero;
};
686
687 static struct syscall_fmt {
688         const char *name;
689         const char *alias;
690         struct syscall_arg_fmt arg[6];
691         u8         nr_args;
692         bool       errpid;
693         bool       timeout;
694         bool       hexret;
695 } syscall_fmts[] = {
696         { .name     = "access",
697           .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
698         { .name     = "arch_prctl",
699           .arg = { [0] = { .scnprintf = SCA_X86_ARCH_PRCTL_CODE, /* code */ },
700                    [1] = { .scnprintf = SCA_PTR, /* arg2 */ }, }, },
701         { .name     = "bind",
702           .arg = { [1] = { .scnprintf = SCA_SOCKADDR, /* umyaddr */ }, }, },
703         { .name     = "bpf",
704           .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
705         { .name     = "brk",        .hexret = true,
706           .arg = { [0] = { .scnprintf = SCA_PTR, /* brk */ }, }, },
707         { .name     = "clock_gettime",
708           .arg = { [0] = STRARRAY(clk_id, clockid), }, },
709         { .name     = "clone",      .errpid = true, .nr_args = 5,
710           .arg = { [0] = { .name = "flags",         .scnprintf = SCA_CLONE_FLAGS, },
711                    [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
712                    [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
713                    [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
714                    [4] = { .name = "tls",           .scnprintf = SCA_HEX, }, }, },
715         { .name     = "close",
716           .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
717         { .name     = "connect",
718           .arg = { [1] = { .scnprintf = SCA_SOCKADDR, /* servaddr */ }, }, },
719         { .name     = "epoll_ctl",
720           .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
721         { .name     = "eventfd2",
722           .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
723         { .name     = "fchmodat",
724           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
725         { .name     = "fchownat",
726           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
727         { .name     = "fcntl",
728           .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
729                            .parm      = &strarrays__fcntl_cmds_arrays,
730                            .show_zero = true, },
731                    [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
732         { .name     = "flock",
733           .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
734         { .name     = "fsconfig",
735           .arg = { [1] = STRARRAY(cmd, fsconfig_cmds), }, },
736         { .name     = "fsmount",
737           .arg = { [1] = STRARRAY_FLAGS(flags, fsmount_flags),
738                    [2] = { .scnprintf = SCA_FSMOUNT_ATTR_FLAGS, /* attr_flags */ }, }, },
739         { .name     = "fspick",
740           .arg = { [0] = { .scnprintf = SCA_FDAT,         /* dfd */ },
741                    [1] = { .scnprintf = SCA_FILENAME,     /* path */ },
742                    [2] = { .scnprintf = SCA_FSPICK_FLAGS, /* flags */ }, }, },
743         { .name     = "fstat", .alias = "newfstat", },
744         { .name     = "fstatat", .alias = "newfstatat", },
745         { .name     = "futex",
746           .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
747                    [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
748         { .name     = "futimesat",
749           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
750         { .name     = "getitimer",
751           .arg = { [0] = STRARRAY(which, itimers), }, },
752         { .name     = "getpid",     .errpid = true, },
753         { .name     = "getpgid",    .errpid = true, },
754         { .name     = "getppid",    .errpid = true, },
755         { .name     = "getrandom",
756           .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
757         { .name     = "getrlimit",
758           .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
759         { .name     = "gettid",     .errpid = true, },
760         { .name     = "ioctl",
761           .arg = {
762 #if defined(__i386__) || defined(__x86_64__)
763 /*
764  * FIXME: Make this available to all arches.
765  */
766                    [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
767                    [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
768 #else
769                    [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
770 #endif
771         { .name     = "kcmp",       .nr_args = 5,
772           .arg = { [0] = { .name = "pid1",      .scnprintf = SCA_PID, },
773                    [1] = { .name = "pid2",      .scnprintf = SCA_PID, },
774                    [2] = { .name = "type",      .scnprintf = SCA_KCMP_TYPE, },
775                    [3] = { .name = "idx1",      .scnprintf = SCA_KCMP_IDX, },
776                    [4] = { .name = "idx2",      .scnprintf = SCA_KCMP_IDX, }, }, },
777         { .name     = "keyctl",
778           .arg = { [0] = STRARRAY(option, keyctl_options), }, },
779         { .name     = "kill",
780           .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
781         { .name     = "linkat",
782           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
783         { .name     = "lseek",
784           .arg = { [2] = STRARRAY(whence, whences), }, },
785         { .name     = "lstat", .alias = "newlstat", },
786         { .name     = "madvise",
787           .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
788                    [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
789         { .name     = "mkdirat",
790           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
791         { .name     = "mknodat",
792           .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
793         { .name     = "mmap",       .hexret = true,
794 /* The standard mmap maps to old_mmap on s390x */
795 #if defined(__s390x__)
796         .alias = "old_mmap",
797 #endif
798           .arg = { [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ },
799                    [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ },
800                    [5] = { .scnprintf = SCA_HEX,        /* offset */ }, }, },
801         { .name     = "mount",
802           .arg = { [0] = { .scnprintf = SCA_FILENAME, /* dev_name */ },
803                    [3] = { .scnprintf = SCA_MOUNT_FLAGS, /* flags */
804                            .mask_val  = SCAMV_MOUNT_FLAGS, /* flags */ }, }, },
805         { .name     = "move_mount",
806           .arg = { [0] = { .scnprintf = SCA_FDAT,       /* from_dfd */ },
807                    [1] = { .scnprintf = SCA_FILENAME, /* from_pathname */ },
808                    [2] = { .scnprintf = SCA_FDAT,       /* to_dfd */ },
809                    [3] = { .scnprintf = SCA_FILENAME, /* to_pathname */ },
810                    [4] = { .scnprintf = SCA_MOVE_MOUNT_FLAGS, /* flags */ }, }, },
811         { .name     = "mprotect",
812           .arg = { [0] = { .scnprintf = SCA_HEX,        /* start */ },
813                    [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ }, }, },
814         { .name     = "mq_unlink",
815           .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
816         { .name     = "mremap",     .hexret = true,
817           .arg = { [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ }, }, },
818         { .name     = "name_to_handle_at",
819           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
820         { .name     = "newfstatat",
821           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
822         { .name     = "open",
823           .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
824         { .name     = "open_by_handle_at",
825           .arg = { [0] = { .scnprintf = SCA_FDAT,       /* dfd */ },
826                    [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
827         { .name     = "openat",
828           .arg = { [0] = { .scnprintf = SCA_FDAT,       /* dfd */ },
829                    [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
830         { .name     = "perf_event_open",
831           .arg = { [2] = { .scnprintf = SCA_INT,        /* cpu */ },
832                    [3] = { .scnprintf = SCA_FD,         /* group_fd */ },
833                    [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
834         { .name     = "pipe2",
835           .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
836         { .name     = "pkey_alloc",
837           .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,   /* access_rights */ }, }, },
838         { .name     = "pkey_free",
839           .arg = { [0] = { .scnprintf = SCA_INT,        /* key */ }, }, },
840         { .name     = "pkey_mprotect",
841           .arg = { [0] = { .scnprintf = SCA_HEX,        /* start */ },
842                    [2] = { .scnprintf = SCA_MMAP_PROT,  /* prot */ },
843                    [3] = { .scnprintf = SCA_INT,        /* pkey */ }, }, },
844         { .name     = "poll", .timeout = true, },
845         { .name     = "ppoll", .timeout = true, },
846         { .name     = "prctl",
847           .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
848                    [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
849                    [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
850         { .name     = "pread", .alias = "pread64", },
851         { .name     = "preadv", .alias = "pread", },
852         { .name     = "prlimit64",
853           .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
854         { .name     = "pwrite", .alias = "pwrite64", },
855         { .name     = "readlinkat",
856           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
857         { .name     = "recvfrom",
858           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
859         { .name     = "recvmmsg",
860           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
861         { .name     = "recvmsg",
862           .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
863         { .name     = "renameat",
864           .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ },
865                    [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ }, }, },
866         { .name     = "renameat2",
867           .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ },
868                    [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ },
869                    [4] = { .scnprintf = SCA_RENAMEAT2_FLAGS, /* flags */ }, }, },
870         { .name     = "rt_sigaction",
871           .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
872         { .name     = "rt_sigprocmask",
873           .arg = { [0] = STRARRAY(how, sighow), }, },
874         { .name     = "rt_sigqueueinfo",
875           .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
876         { .name     = "rt_tgsigqueueinfo",
877           .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
878         { .name     = "sched_setscheduler",
879           .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
880         { .name     = "seccomp",
881           .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,    /* op */ },
882                    [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
883         { .name     = "select", .timeout = true, },
884         { .name     = "sendmmsg",
885           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
886         { .name     = "sendmsg",
887           .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
888         { .name     = "sendto",
889           .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ },
890                    [4] = { .scnprintf = SCA_SOCKADDR, /* addr */ }, }, },
891         { .name     = "set_tid_address", .errpid = true, },
892         { .name     = "setitimer",
893           .arg = { [0] = STRARRAY(which, itimers), }, },
894         { .name     = "setrlimit",
895           .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
896         { .name     = "socket",
897           .arg = { [0] = STRARRAY(family, socket_families),
898                    [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
899                    [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
900         { .name     = "socketpair",
901           .arg = { [0] = STRARRAY(family, socket_families),
902                    [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
903                    [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
904         { .name     = "stat", .alias = "newstat", },
905         { .name     = "statx",
906           .arg = { [0] = { .scnprintf = SCA_FDAT,        /* fdat */ },
907                    [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
908                    [3] = { .scnprintf = SCA_STATX_MASK,  /* mask */ }, }, },
909         { .name     = "swapoff",
910           .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
911         { .name     = "swapon",
912           .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
913         { .name     = "symlinkat",
914           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
915         { .name     = "sync_file_range",
916           .arg = { [3] = { .scnprintf = SCA_SYNC_FILE_RANGE_FLAGS, /* flags */ }, }, },
917         { .name     = "tgkill",
918           .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
919         { .name     = "tkill",
920           .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
921         { .name     = "umount2", .alias = "umount",
922           .arg = { [0] = { .scnprintf = SCA_FILENAME, /* name */ }, }, },
923         { .name     = "uname", .alias = "newuname", },
924         { .name     = "unlinkat",
925           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
926         { .name     = "utimensat",
927           .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
928         { .name     = "wait4",      .errpid = true,
929           .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
930         { .name     = "waitid",     .errpid = true,
931           .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
932 };
933
934 static int syscall_fmt__cmp(const void *name, const void *fmtp)
935 {
936         const struct syscall_fmt *fmt = fmtp;
937         return strcmp(name, fmt->name);
938 }
939
940 static struct syscall_fmt *syscall_fmt__find(const char *name)
941 {
942         const int nmemb = ARRAY_SIZE(syscall_fmts);
943         return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
944 }
945
946 static struct syscall_fmt *syscall_fmt__find_by_alias(const char *alias)
947 {
948         int i, nmemb = ARRAY_SIZE(syscall_fmts);
949
950         for (i = 0; i < nmemb; ++i) {
951                 if (syscall_fmts[i].alias && strcmp(syscall_fmts[i].alias, alias) == 0)
952                         return &syscall_fmts[i];
953         }
954
955         return NULL;
956 }
957
/*
 * Per syscall id state, filled in by trace__read_syscall_info().
 */
struct syscall {
	/* "syscalls:sys_enter_<name>" tracepoint format, may be an ERR_PTR */
	struct tep_event	*tp_format;
	/* number of entries in ->arg_fmt that describe real arguments */
	int			nr_args;
	/*
	 * Sum of the sizes of the syscall arguments, anything after that is
	 * augmented stuff: pathname for openat, etc.
	 */
	int			args_size;
	/* is this "exit" or "exit_group"? */
	bool			is_exit;
	/*
	 * Is this "open" or "openat"? To associate the fd returned in
	 * sys_exit with the pathname in sys_enter.
	 */
	bool			is_open;
	/* tracepoint fields, with the leading syscall number field dropped */
	struct tep_format_field	*args;
	/* name as returned by syscalltbl__name() */
	const char		*name;
	/* matching syscall_fmts[] entry, NULL if there is none */
	struct syscall_fmt	*fmt;
	/* per argument formatters, allocated by syscall__alloc_arg_fmts() */
	struct syscall_arg_fmt	*arg_fmt;
};
974
975 /*
976  * Must match what is in the BPF program:
977  *
978  * tools/perf/examples/bpf/augmented_raw_syscalls.c
979  */
980 struct bpf_map_syscall_entry {
981         bool    enabled;
982         u16     string_args_len[6];
983 };
984
985 /*
986  * We need to have this 'calculated' boolean because in some cases we really
987  * don't know what is the duration of a syscall, for instance, when we start
988  * a session and some threads are waiting for a syscall to finish, say 'poll',
989  * in which case all we can do is to print "( ? ) for duration and for the
990  * start timestamp.
991  */
992 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
993 {
994         double duration = (double)t / NSEC_PER_MSEC;
995         size_t printed = fprintf(fp, "(");
996
997         if (!calculated)
998                 printed += fprintf(fp, "         ");
999         else if (duration >= 1.0)
1000                 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
1001         else if (duration >= 0.01)
1002                 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
1003         else
1004                 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
1005         return printed + fprintf(fp, "): ");
1006 }
1007
1008 /**
1009  * filename.ptr: The filename char pointer that will be vfs_getname'd
1010  * filename.entry_str_pos: Where to insert the string translated from
1011  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
1012  * ret_scnprintf: syscall args may set this to a different syscall return
1013  *                formatter, for instance, fcntl may return fds, file flags, etc.
1014  */
1015 struct thread_trace {
1016         u64               entry_time;
1017         bool              entry_pending;
1018         unsigned long     nr_events;
1019         unsigned long     pfmaj, pfmin;
1020         char              *entry_str;
1021         double            runtime_ms;
1022         size_t            (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
1023         struct {
1024                 unsigned long ptr;
1025                 short int     entry_str_pos;
1026                 bool          pending_open;
1027                 unsigned int  namelen;
1028                 char          *name;
1029         } filename;
1030         struct {
1031                 int           max;
1032                 struct file   *table;
1033         } files;
1034
1035         struct intlist *syscall_stats;
1036 };
1037
1038 static struct thread_trace *thread_trace__new(void)
1039 {
1040         struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
1041
1042         if (ttrace) {
1043                 ttrace->files.max = -1;
1044                 ttrace->syscall_stats = intlist__new(NULL);
1045         }
1046
1047         return ttrace;
1048 }
1049
1050 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
1051 {
1052         struct thread_trace *ttrace;
1053
1054         if (thread == NULL)
1055                 goto fail;
1056
1057         if (thread__priv(thread) == NULL)
1058                 thread__set_priv(thread, thread_trace__new());
1059
1060         if (thread__priv(thread) == NULL)
1061                 goto fail;
1062
1063         ttrace = thread__priv(thread);
1064         ++ttrace->nr_events;
1065
1066         return ttrace;
1067 fail:
1068         color_fprintf(fp, PERF_COLOR_RED,
1069                       "WARNING: not enough memory, dropping samples!\n");
1070         return NULL;
1071 }
1072
1073
1074 void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
1075                                     size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
1076 {
1077         struct thread_trace *ttrace = thread__priv(arg->thread);
1078
1079         ttrace->ret_scnprintf = ret_scnprintf;
1080 }
1081
/*
 * Bit flags.
 * NOTE(review): presumably select major/minor page fault tracing - confirm
 * against the users, which are outside this part of the file.
 */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

/* NOTE(review): presumably the size of thread_trace->entry_str - confirm. */
static const size_t trace__entry_str_size = 2048;
1086
1087 static struct file *thread_trace__files_entry(struct thread_trace *ttrace, int fd)
1088 {
1089         if (fd < 0)
1090                 return NULL;
1091
1092         if (fd > ttrace->files.max) {
1093                 struct file *nfiles = realloc(ttrace->files.table, (fd + 1) * sizeof(struct file));
1094
1095                 if (nfiles == NULL)
1096                         return NULL;
1097
1098                 if (ttrace->files.max != -1) {
1099                         memset(nfiles + ttrace->files.max + 1, 0,
1100                                (fd - ttrace->files.max) * sizeof(struct file));
1101                 } else {
1102                         memset(nfiles, 0, (fd + 1) * sizeof(struct file));
1103                 }
1104
1105                 ttrace->files.table = nfiles;
1106                 ttrace->files.max   = fd;
1107         }
1108
1109         return ttrace->files.table + fd;
1110 }
1111
/*
 * Same as thread_trace__files_entry(), but starting from the thread whose
 * priv pointer holds the thread_trace.
 */
struct file *thread__files_entry(struct thread *thread, int fd)
{
	struct thread_trace *ttrace = thread__priv(thread);

	return thread_trace__files_entry(ttrace, fd);
}
1116
1117 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
1118 {
1119         struct thread_trace *ttrace = thread__priv(thread);
1120         struct file *file = thread_trace__files_entry(ttrace, fd);
1121
1122         if (file != NULL) {
1123                 struct stat st;
1124                 if (stat(pathname, &st) == 0)
1125                         file->dev_maj = major(st.st_rdev);
1126                 file->pathname = strdup(pathname);
1127                 if (file->pathname)
1128                         return 0;
1129         }
1130
1131         return -1;
1132 }
1133
1134 static int thread__read_fd_path(struct thread *thread, int fd)
1135 {
1136         char linkname[PATH_MAX], pathname[PATH_MAX];
1137         struct stat st;
1138         int ret;
1139
1140         if (thread->pid_ == thread->tid) {
1141                 scnprintf(linkname, sizeof(linkname),
1142                           "/proc/%d/fd/%d", thread->pid_, fd);
1143         } else {
1144                 scnprintf(linkname, sizeof(linkname),
1145                           "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
1146         }
1147
1148         if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1149                 return -1;
1150
1151         ret = readlink(linkname, pathname, sizeof(pathname));
1152
1153         if (ret < 0 || ret > st.st_size)
1154                 return -1;
1155
1156         pathname[ret] = '\0';
1157         return trace__set_fd_pathname(thread, fd, pathname);
1158 }
1159
1160 static const char *thread__fd_path(struct thread *thread, int fd,
1161                                    struct trace *trace)
1162 {
1163         struct thread_trace *ttrace = thread__priv(thread);
1164
1165         if (ttrace == NULL)
1166                 return NULL;
1167
1168         if (fd < 0)
1169                 return NULL;
1170
1171         if ((fd > ttrace->files.max || ttrace->files.table[fd].pathname == NULL)) {
1172                 if (!trace->live)
1173                         return NULL;
1174                 ++trace->stats.proc_getname;
1175                 if (thread__read_fd_path(thread, fd))
1176                         return NULL;
1177         }
1178
1179         return ttrace->files.table[fd].pathname;
1180 }
1181
1182 size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
1183 {
1184         int fd = arg->val;
1185         size_t printed = scnprintf(bf, size, "%d", fd);
1186         const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1187
1188         if (path)
1189                 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1190
1191         return printed;
1192 }
1193
1194 size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1195 {
1196         size_t printed = scnprintf(bf, size, "%d", fd);
1197         struct thread *thread = machine__find_thread(trace->host, pid, pid);
1198
1199         if (thread) {
1200                 const char *path = thread__fd_path(thread, fd, trace);
1201
1202                 if (path)
1203                         printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1204
1205                 thread__put(thread);
1206         }
1207
1208         return printed;
1209 }
1210
1211 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1212                                               struct syscall_arg *arg)
1213 {
1214         int fd = arg->val;
1215         size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1216         struct thread_trace *ttrace = thread__priv(arg->thread);
1217
1218         if (ttrace && fd >= 0 && fd <= ttrace->files.max)
1219                 zfree(&ttrace->files.table[fd].pathname);
1220
1221         return printed;
1222 }
1223
1224 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1225                                      unsigned long ptr)
1226 {
1227         struct thread_trace *ttrace = thread__priv(thread);
1228
1229         ttrace->filename.ptr = ptr;
1230         ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1231 }
1232
1233 static size_t syscall_arg__scnprintf_augmented_string(struct syscall_arg *arg, char *bf, size_t size)
1234 {
1235         struct augmented_arg *augmented_arg = arg->augmented.args;
1236         size_t printed = scnprintf(bf, size, "\"%.*s\"", augmented_arg->size, augmented_arg->value);
1237         /*
1238          * So that the next arg with a payload can consume its augmented arg, i.e. for rename* syscalls
1239          * we would have two strings, each prefixed by its size.
1240          */
1241         int consumed = sizeof(*augmented_arg) + augmented_arg->size;
1242
1243         arg->augmented.args = ((void *)arg->augmented.args) + consumed;
1244         arg->augmented.size -= consumed;
1245
1246         return printed;
1247 }
1248
1249 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1250                                               struct syscall_arg *arg)
1251 {
1252         unsigned long ptr = arg->val;
1253
1254         if (arg->augmented.args)
1255                 return syscall_arg__scnprintf_augmented_string(arg, bf, size);
1256
1257         if (!arg->trace->vfs_getname)
1258                 return scnprintf(bf, size, "%#x", ptr);
1259
1260         thread__set_filename_pos(arg->thread, bf, ptr);
1261         return 0;
1262 }
1263
1264 static bool trace__filter_duration(struct trace *trace, double t)
1265 {
1266         return t < (trace->duration_filter * NSEC_PER_MSEC);
1267 }
1268
1269 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1270 {
1271         double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1272
1273         return fprintf(fp, "%10.3f ", ts);
1274 }
1275
1276 /*
1277  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1278  * using ttrace->entry_time for a thread that receives a sys_exit without
1279  * first having received a sys_enter ("poll" issued before tracing session
1280  * starts, lost sys_enter exit due to ring buffer overflow).
1281  */
1282 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1283 {
1284         if (tstamp > 0)
1285                 return __trace__fprintf_tstamp(trace, tstamp, fp);
1286
1287         return fprintf(fp, "         ? ");
1288 }
1289
/*
 * Flags set by sig_handler(). They are written from an asynchronous signal
 * handler, so they are 'volatile sig_atomic_t': that is the object type ISO C
 * guarantees a handler can safely store to, and 'volatile' keeps readers
 * from caching a stale value. Both only ever hold 0 or 1.
 */
static volatile sig_atomic_t done = false;
static volatile sig_atomic_t interrupted = false;

/*
 * Signal handler: request termination and remember whether the signal that
 * asked for it was SIGINT.
 */
static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
1298
1299 static size_t trace__fprintf_comm_tid(struct trace *trace, struct thread *thread, FILE *fp)
1300 {
1301         size_t printed = 0;
1302
1303         if (trace->multiple_threads) {
1304                 if (trace->show_comm)
1305                         printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1306                 printed += fprintf(fp, "%d ", thread->tid);
1307         }
1308
1309         return printed;
1310 }
1311
1312 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1313                                         u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1314 {
1315         size_t printed = 0;
1316
1317         if (trace->show_tstamp)
1318                 printed = trace__fprintf_tstamp(trace, tstamp, fp);
1319         if (trace->show_duration)
1320                 printed += fprintf_duration(duration, duration_calculated, fp);
1321         return printed + trace__fprintf_comm_tid(trace, thread, fp);
1322 }
1323
1324 static int trace__process_event(struct trace *trace, struct machine *machine,
1325                                 union perf_event *event, struct perf_sample *sample)
1326 {
1327         int ret = 0;
1328
1329         switch (event->header.type) {
1330         case PERF_RECORD_LOST:
1331                 color_fprintf(trace->output, PERF_COLOR_RED,
1332                               "LOST %" PRIu64 " events!\n", event->lost.lost);
1333                 ret = machine__process_lost_event(machine, event, sample);
1334                 break;
1335         default:
1336                 ret = machine__process_event(machine, event, sample);
1337                 break;
1338         }
1339
1340         return ret;
1341 }
1342
1343 static int trace__tool_process(struct perf_tool *tool,
1344                                union perf_event *event,
1345                                struct perf_sample *sample,
1346                                struct machine *machine)
1347 {
1348         struct trace *trace = container_of(tool, struct trace, tool);
1349         return trace__process_event(trace, machine, event, sample);
1350 }
1351
1352 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1353 {
1354         struct machine *machine = vmachine;
1355
1356         if (machine->kptr_restrict_warned)
1357                 return NULL;
1358
1359         if (symbol_conf.kptr_restrict) {
1360                 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1361                            "Check /proc/sys/kernel/kptr_restrict.\n\n"
1362                            "Kernel samples will not be resolved.\n");
1363                 machine->kptr_restrict_warned = true;
1364                 return NULL;
1365         }
1366
1367         return machine__resolve_kernel_addr(vmachine, addrp, modp);
1368 }
1369
1370 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1371 {
1372         int err = symbol__init(NULL);
1373
1374         if (err)
1375                 return err;
1376
1377         trace->host = machine__new_host();
1378         if (trace->host == NULL)
1379                 return -ENOMEM;
1380
1381         err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
1382         if (err < 0)
1383                 goto out;
1384
1385         err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1386                                             evlist->threads, trace__tool_process, false,
1387                                             1);
1388 out:
1389         if (err)
1390                 symbol__exit();
1391
1392         return err;
1393 }
1394
1395 static void trace__symbols__exit(struct trace *trace)
1396 {
1397         machine__exit(trace->host);
1398         trace->host = NULL;
1399
1400         symbol__exit();
1401 }
1402
1403 static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1404 {
1405         int idx;
1406
1407         if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1408                 nr_args = sc->fmt->nr_args;
1409
1410         sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1411         if (sc->arg_fmt == NULL)
1412                 return -1;
1413
1414         for (idx = 0; idx < nr_args; ++idx) {
1415                 if (sc->fmt)
1416                         sc->arg_fmt[idx] = sc->fmt->arg[idx];
1417         }
1418
1419         sc->nr_args = nr_args;
1420         return 0;
1421 }
1422
1423 static int syscall__set_arg_fmts(struct syscall *sc)
1424 {
1425         struct tep_format_field *field, *last_field = NULL;
1426         int idx = 0, len;
1427
1428         for (field = sc->args; field; field = field->next, ++idx) {
1429                 last_field = field;
1430
1431                 if (sc->fmt && sc->fmt->arg[idx].scnprintf)
1432                         continue;
1433
1434                 len = strlen(field->name);
1435
1436                 if (strcmp(field->type, "const char *") == 0 &&
1437                     ((len >= 4 && strcmp(field->name + len - 4, "name") == 0) ||
1438                      strstr(field->name, "path") != NULL))
1439                         sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
1440                 else if ((field->flags & TEP_FIELD_IS_POINTER) || strstr(field->name, "addr"))
1441                         sc->arg_fmt[idx].scnprintf = SCA_PTR;
1442                 else if (strcmp(field->type, "pid_t") == 0)
1443                         sc->arg_fmt[idx].scnprintf = SCA_PID;
1444                 else if (strcmp(field->type, "umode_t") == 0)
1445                         sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
1446                 else if ((strcmp(field->type, "int") == 0 ||
1447                           strcmp(field->type, "unsigned int") == 0 ||
1448                           strcmp(field->type, "long") == 0) &&
1449                          len >= 2 && strcmp(field->name + len - 2, "fd") == 0) {
1450                         /*
1451                          * /sys/kernel/tracing/events/syscalls/sys_enter*
1452                          * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1453                          * 65 int
1454                          * 23 unsigned int
1455                          * 7 unsigned long
1456                          */
1457                         sc->arg_fmt[idx].scnprintf = SCA_FD;
1458                 }
1459         }
1460
1461         if (last_field)
1462                 sc->args_size = last_field->offset + last_field->size;
1463
1464         return 0;
1465 }
1466
1467 static int trace__read_syscall_info(struct trace *trace, int id)
1468 {
1469         char tp_name[128];
1470         struct syscall *sc;
1471         const char *name = syscalltbl__name(trace->sctbl, id);
1472
1473         if (name == NULL)
1474                 return -1;
1475
1476         if (id > trace->syscalls.max) {
1477                 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1478
1479                 if (nsyscalls == NULL)
1480                         return -1;
1481
1482                 if (trace->syscalls.max != -1) {
1483                         memset(nsyscalls + trace->syscalls.max + 1, 0,
1484                                (id - trace->syscalls.max) * sizeof(*sc));
1485                 } else {
1486                         memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1487                 }
1488
1489                 trace->syscalls.table = nsyscalls;
1490                 trace->syscalls.max   = id;
1491         }
1492
1493         sc = trace->syscalls.table + id;
1494         sc->name = name;
1495
1496         sc->fmt  = syscall_fmt__find(sc->name);
1497
1498         snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1499         sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1500
1501         if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1502                 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1503                 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1504         }
1505
1506         if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
1507                 return -1;
1508
1509         if (IS_ERR(sc->tp_format))
1510                 return -1;
1511
1512         sc->args = sc->tp_format->format.fields;
1513         /*
1514          * We need to check and discard the first variable '__syscall_nr'
1515          * or 'nr' that mean the syscall number. It is needless here.
1516          * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
1517          */
1518         if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1519                 sc->args = sc->args->next;
1520                 --sc->nr_args;
1521         }
1522
1523         sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1524         sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat");
1525
1526         return syscall__set_arg_fmts(sc);
1527 }
1528
1529 static int trace__validate_ev_qualifier(struct trace *trace)
1530 {
1531         int err = 0;
1532         bool printed_invalid_prefix = false;
1533         struct str_node *pos;
1534         size_t nr_used = 0, nr_allocated = strlist__nr_entries(trace->ev_qualifier);
1535
1536         trace->ev_qualifier_ids.entries = malloc(nr_allocated *
1537                                                  sizeof(trace->ev_qualifier_ids.entries[0]));
1538
1539         if (trace->ev_qualifier_ids.entries == NULL) {
1540                 fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1541                        trace->output);
1542                 err = -EINVAL;
1543                 goto out;
1544         }
1545
1546         strlist__for_each_entry(pos, trace->ev_qualifier) {
1547                 const char *sc = pos->s;
1548                 int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;
1549
1550                 if (id < 0) {
1551                         id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
1552                         if (id >= 0)
1553                                 goto matches;
1554
1555                         if (!printed_invalid_prefix) {
1556                                 pr_debug("Skipping unknown syscalls: ");
1557                                 printed_invalid_prefix = true;
1558                         } else {
1559                                 pr_debug(", ");
1560                         }
1561
1562                         pr_debug("%s", sc);
1563                         continue;
1564                 }
1565 matches:
1566                 trace->ev_qualifier_ids.entries[nr_used++] = id;
1567                 if (match_next == -1)
1568                         continue;
1569
1570                 while (1) {
1571                         id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
1572                         if (id < 0)
1573                                 break;
1574                         if (nr_allocated == nr_used) {
1575                                 void *entries;
1576
1577                                 nr_allocated += 8;
1578                                 entries = realloc(trace->ev_qualifier_ids.entries,
1579                                                   nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
1580                                 if (entries == NULL) {
1581                                         err = -ENOMEM;
1582                                         fputs("\nError:\t Not enough memory for parsing\n", trace->output);
1583                                         goto out_free;
1584                                 }
1585                                 trace->ev_qualifier_ids.entries = entries;
1586                         }
1587                         trace->ev_qualifier_ids.entries[nr_used++] = id;
1588                 }
1589         }
1590
1591         trace->ev_qualifier_ids.nr = nr_used;
1592 out:
1593         if (printed_invalid_prefix)
1594                 pr_debug("\n");
1595         return err;
1596 out_free:
1597         zfree(&trace->ev_qualifier_ids.entries);
1598         trace->ev_qualifier_ids.nr = 0;
1599         goto out;
1600 }
1601
1602 /*
1603  * args is to be interpreted as a series of longs but we need to handle
1604  * 8-byte unaligned accesses. args points to raw_data within the event
1605  * and raw_data is guaranteed to be 8-byte unaligned because it is
1606  * preceded by raw_size which is a u32. So we need to copy args to a temp
1607  * variable to read it. Most notably this avoids extended load instructions
1608  * on unaligned addresses
1609  */
1610 unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1611 {
1612         unsigned long val;
1613         unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1614
1615         memcpy(&val, p, sizeof(val));
1616         return val;
1617 }
1618
1619 static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1620                                       struct syscall_arg *arg)
1621 {
1622         if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1623                 return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1624
1625         return scnprintf(bf, size, "arg%d: ", arg->idx);
1626 }
1627
1628 /*
1629  * Check if the value is in fact zero, i.e. mask whatever needs masking, such
1630  * as mount 'flags' argument that needs ignoring some magic flag, see comment
1631  * in tools/perf/trace/beauty/mount_flags.c
1632  */
1633 static unsigned long syscall__mask_val(struct syscall *sc, struct syscall_arg *arg, unsigned long val)
1634 {
1635         if (sc->arg_fmt && sc->arg_fmt[arg->idx].mask_val)
1636                 return sc->arg_fmt[arg->idx].mask_val(arg, val);
1637
1638         return val;
1639 }
1640
1641 static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1642                                      struct syscall_arg *arg, unsigned long val)
1643 {
1644         if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1645                 arg->val = val;
1646                 if (sc->arg_fmt[arg->idx].parm)
1647                         arg->parm = sc->arg_fmt[arg->idx].parm;
1648                 return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1649         }
1650         return scnprintf(bf, size, "%ld", val);
1651 }
1652
1653 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1654                                       unsigned char *args, void *augmented_args, int augmented_args_size,
1655                                       struct trace *trace, struct thread *thread)
1656 {
1657         size_t printed = 0;
1658         unsigned long val;
1659         u8 bit = 1;
1660         struct syscall_arg arg = {
1661                 .args   = args,
1662                 .augmented = {
1663                         .size = augmented_args_size,
1664                         .args = augmented_args,
1665                 },
1666                 .idx    = 0,
1667                 .mask   = 0,
1668                 .trace  = trace,
1669                 .thread = thread,
1670                 .show_string_prefix = trace->show_string_prefix,
1671         };
1672         struct thread_trace *ttrace = thread__priv(thread);
1673
1674         /*
1675          * Things like fcntl will set this in its 'cmd' formatter to pick the
1676          * right formatter for the return value (an fd? file flags?), which is
1677          * not needed for syscalls that always return a given type, say an fd.
1678          */
1679         ttrace->ret_scnprintf = NULL;
1680
1681         if (sc->args != NULL) {
1682                 struct tep_format_field *field;
1683
1684                 for (field = sc->args; field;
1685                      field = field->next, ++arg.idx, bit <<= 1) {
1686                         if (arg.mask & bit)
1687                                 continue;
1688
1689                         val = syscall_arg__val(&arg, arg.idx);
1690                         /*
1691                          * Some syscall args need some mask, most don't and
1692                          * return val untouched.
1693                          */
1694                         val = syscall__mask_val(sc, &arg, val);
1695
1696                         /*
1697                          * Suppress this argument if its value is zero and
1698                          * and we don't have a string associated in an
1699                          * strarray for it.
1700                          */
1701                         if (val == 0 &&
1702                             !trace->show_zeros &&
1703                             !(sc->arg_fmt &&
1704                               (sc->arg_fmt[arg.idx].show_zero ||
1705                                sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
1706                                sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
1707                               sc->arg_fmt[arg.idx].parm))
1708                                 continue;
1709
1710                         printed += scnprintf(bf + printed, size - printed, "%s", printed ? ", " : "");
1711
1712                         if (trace->show_arg_names)
1713                                 printed += scnprintf(bf + printed, size - printed, "%s: ", field->name);
1714
1715                         printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
1716                 }
1717         } else if (IS_ERR(sc->tp_format)) {
1718                 /*
1719                  * If we managed to read the tracepoint /format file, then we
1720                  * may end up not having any args, like with gettid(), so only
1721                  * print the raw args when we didn't manage to read it.
1722                  */
1723                 while (arg.idx < sc->nr_args) {
1724                         if (arg.mask & bit)
1725                                 goto next_arg;
1726                         val = syscall_arg__val(&arg, arg.idx);
1727                         if (printed)
1728                                 printed += scnprintf(bf + printed, size - printed, ", ");
1729                         printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
1730                         printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
1731 next_arg:
1732                         ++arg.idx;
1733                         bit <<= 1;
1734                 }
1735         }
1736
1737         return printed;
1738 }
1739
/*
 * Signature shared by the per-tracepoint sample handlers in this file, such
 * as trace__sys_enter(), trace__sys_exit() and trace__vfs_getname().
 */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1743
1744 static struct syscall *trace__syscall_info(struct trace *trace,
1745                                            struct perf_evsel *evsel, int id)
1746 {
1747
1748         if (id < 0) {
1749
1750                 /*
1751                  * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1752                  * before that, leaving at a higher verbosity level till that is
1753                  * explained. Reproduced with plain ftrace with:
1754                  *
1755                  * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1756                  * grep "NR -1 " /t/trace_pipe
1757                  *
1758                  * After generating some load on the machine.
1759                  */
1760                 if (verbose > 1) {
1761                         static u64 n;
1762                         fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1763                                 id, perf_evsel__name(evsel), ++n);
1764                 }
1765                 return NULL;
1766         }
1767
1768         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1769             trace__read_syscall_info(trace, id))
1770                 goto out_cant_read;
1771
1772         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1773                 goto out_cant_read;
1774
1775         return &trace->syscalls.table[id];
1776
1777 out_cant_read:
1778         if (verbose > 0) {
1779                 fprintf(trace->output, "Problems reading syscall %d", id);
1780                 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1781                         fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1782                 fputs(" information\n", trace->output);
1783         }
1784         return NULL;
1785 }
1786
1787 static void thread__update_stats(struct thread_trace *ttrace,
1788                                  int id, struct perf_sample *sample)
1789 {
1790         struct int_node *inode;
1791         struct stats *stats;
1792         u64 duration = 0;
1793
1794         inode = intlist__findnew(ttrace->syscall_stats, id);
1795         if (inode == NULL)
1796                 return;
1797
1798         stats = inode->priv;
1799         if (stats == NULL) {
1800                 stats = malloc(sizeof(struct stats));
1801                 if (stats == NULL)
1802                         return;
1803                 init_stats(stats);
1804                 inode->priv = stats;
1805         }
1806
1807         if (ttrace->entry_time && sample->time > ttrace->entry_time)
1808                 duration = sample->time - ttrace->entry_time;
1809
1810         update_stats(stats, duration);
1811 }
1812
1813 static int trace__printf_interrupted_entry(struct trace *trace)
1814 {
1815         struct thread_trace *ttrace;
1816         size_t printed;
1817         int len;
1818
1819         if (trace->failure_only || trace->current == NULL)
1820                 return 0;
1821
1822         ttrace = thread__priv(trace->current);
1823
1824         if (!ttrace->entry_pending)
1825                 return 0;
1826
1827         printed  = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
1828         printed += len = fprintf(trace->output, "%s)", ttrace->entry_str);
1829
1830         if (len < trace->args_alignment - 4)
1831                 printed += fprintf(trace->output, "%-*s", trace->args_alignment - 4 - len, " ");
1832
1833         printed += fprintf(trace->output, " ...\n");
1834
1835         ttrace->entry_pending = false;
1836         ++trace->nr_events_printed;
1837
1838         return printed;
1839 }
1840
1841 static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
1842                                  struct perf_sample *sample, struct thread *thread)
1843 {
1844         int printed = 0;
1845
1846         if (trace->print_sample) {
1847                 double ts = (double)sample->time / NSEC_PER_MSEC;
1848
1849                 printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
1850                                    perf_evsel__name(evsel), ts,
1851                                    thread__comm_str(thread),
1852                                    sample->pid, sample->tid, sample->cpu);
1853         }
1854
1855         return printed;
1856 }
1857
1858 static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sample, int *augmented_args_size, int raw_augmented_args_size)
1859 {
1860         void *augmented_args = NULL;
1861         /*
1862          * For now with BPF raw_augmented we hook into raw_syscalls:sys_enter
1863          * and there we get all 6 syscall args plus the tracepoint common fields
1864          * that gets calculated at the start and the syscall_nr (another long).
1865          * So we check if that is the case and if so don't look after the
1866          * sc->args_size but always after the full raw_syscalls:sys_enter payload,
1867          * which is fixed.
1868          *
1869          * We'll revisit this later to pass s->args_size to the BPF augmenter
1870          * (now tools/perf/examples/bpf/augmented_raw_syscalls.c, so that it
1871          * copies only what we need for each syscall, like what happens when we
1872          * use syscalls:sys_enter_NAME, so that we reduce the kernel/userspace
1873          * traffic to just what is needed for each syscall.
1874          */
1875         int args_size = raw_augmented_args_size ?: sc->args_size;
1876
1877         *augmented_args_size = sample->raw_size - args_size;
1878         if (*augmented_args_size > 0)
1879                 augmented_args = sample->raw_data + args_size;
1880
1881         return augmented_args;
1882 }
1883
1884 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1885                             union perf_event *event __maybe_unused,
1886                             struct perf_sample *sample)
1887 {
1888         char *msg;
1889         void *args;
1890         int printed = 0;
1891         struct thread *thread;
1892         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1893         int augmented_args_size = 0;
1894         void *augmented_args = NULL;
1895         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1896         struct thread_trace *ttrace;
1897
1898         if (sc == NULL)
1899                 return -1;
1900
1901         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1902         ttrace = thread__trace(thread, trace->output);
1903         if (ttrace == NULL)
1904                 goto out_put;
1905
1906         trace__fprintf_sample(trace, evsel, sample, thread);
1907
1908         args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1909
1910         if (ttrace->entry_str == NULL) {
1911                 ttrace->entry_str = malloc(trace__entry_str_size);
1912                 if (!ttrace->entry_str)
1913                         goto out_put;
1914         }
1915
1916         if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1917                 trace__printf_interrupted_entry(trace);
1918         /*
1919          * If this is raw_syscalls.sys_enter, then it always comes with the 6 possible
1920          * arguments, even if the syscall being handled, say "openat", uses only 4 arguments
1921          * this breaks syscall__augmented_args() check for augmented args, as we calculate
1922          * syscall->args_size using each syscalls:sys_enter_NAME tracefs format file,
1923          * so when handling, say the openat syscall, we end up getting 6 args for the
1924          * raw_syscalls:sys_enter event, when we expected just 4, we end up mistakenly
1925          * thinking that the extra 2 u64 args are the augmented filename, so just check
1926          * here and avoid using augmented syscalls when the evsel is the raw_syscalls one.
1927          */
1928         if (evsel != trace->syscalls.events.sys_enter)
1929                 augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size);
1930         ttrace->entry_time = sample->time;
1931         msg = ttrace->entry_str;
1932         printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1933
1934         printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1935                                            args, augmented_args, augmented_args_size, trace, thread);
1936
1937         if (sc->is_exit) {
1938                 if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
1939                         int alignment = 0;
1940
1941                         trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
1942                         printed = fprintf(trace->output, "%s)", ttrace->entry_str);
1943                         if (trace->args_alignment > printed)
1944                                 alignment = trace->args_alignment - printed;
1945                         fprintf(trace->output, "%*s= ?\n", alignment, " ");
1946                 }
1947         } else {
1948                 ttrace->entry_pending = true;
1949                 /* See trace__vfs_getname & trace__sys_exit */
1950                 ttrace->filename.pending_open = false;
1951         }
1952
1953         if (trace->current != thread) {
1954                 thread__put(trace->current);
1955                 trace->current = thread__get(thread);
1956         }
1957         err = 0;
1958 out_put:
1959         thread__put(thread);
1960         return err;
1961 }
1962
1963 static int trace__fprintf_sys_enter(struct trace *trace, struct perf_evsel *evsel,
1964                                     struct perf_sample *sample)
1965 {
1966         struct thread_trace *ttrace;
1967         struct thread *thread;
1968         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1969         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1970         char msg[1024];
1971         void *args, *augmented_args = NULL;
1972         int augmented_args_size;
1973
1974         if (sc == NULL)
1975                 return -1;
1976
1977         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1978         ttrace = thread__trace(thread, trace->output);
1979         /*
1980          * We need to get ttrace just to make sure it is there when syscall__scnprintf_args()
1981          * and the rest of the beautifiers accessing it via struct syscall_arg touches it.
1982          */
1983         if (ttrace == NULL)
1984                 goto out_put;
1985
1986         args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1987         augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size);
1988         syscall__scnprintf_args(sc, msg, sizeof(msg), args, augmented_args, augmented_args_size, trace, thread);
1989         fprintf(trace->output, "%s", msg);
1990         err = 0;
1991 out_put:
1992         thread__put(thread);
1993         return err;
1994 }
1995
1996 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1997                                     struct perf_sample *sample,
1998                                     struct callchain_cursor *cursor)
1999 {
2000         struct addr_location al;
2001         int max_stack = evsel->attr.sample_max_stack ?
2002                         evsel->attr.sample_max_stack :
2003                         trace->max_stack;
2004         int err;
2005
2006         if (machine__resolve(trace->host, &al, sample) < 0)
2007                 return -1;
2008
2009         err = thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack);
2010         addr_location__put(&al);
2011         return err;
2012 }
2013
2014 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
2015 {
2016         /* TODO: user-configurable print_opts */
2017         const unsigned int print_opts = EVSEL__PRINT_SYM |
2018                                         EVSEL__PRINT_DSO |
2019                                         EVSEL__PRINT_UNKNOWN_AS_ADDR;
2020
2021         return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
2022 }
2023
/*
 * Translate errno value 'err' to its name for the architecture of the
 * environment associated with 'evsel'.
 */
static const char *errno_to_name(struct perf_evsel *evsel, int err)
{
	const char *arch_name = perf_env__arch(perf_evsel__env(evsel));

	return arch_syscalls__strerrno(arch_name, err);
}
2031
2032 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
2033                            union perf_event *event __maybe_unused,
2034                            struct perf_sample *sample)
2035 {
2036         long ret;
2037         u64 duration = 0;
2038         bool duration_calculated = false;
2039         struct thread *thread;
2040         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0, printed = 0;
2041         int alignment = trace->args_alignment;
2042         struct syscall *sc = trace__syscall_info(trace, evsel, id);
2043         struct thread_trace *ttrace;
2044
2045         if (sc == NULL)
2046                 return -1;
2047
2048         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2049         ttrace = thread__trace(thread, trace->output);
2050         if (ttrace == NULL)
2051                 goto out_put;
2052
2053         trace__fprintf_sample(trace, evsel, sample, thread);
2054
2055         if (trace->summary)
2056                 thread__update_stats(ttrace, id, sample);
2057
2058         ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
2059
2060         if (sc->is_open && ret >= 0 && ttrace->filename.pending_open) {
2061                 trace__set_fd_pathname(thread, ret, ttrace->filename.name);
2062                 ttrace->filename.pending_open = false;
2063                 ++trace->stats.vfs_getname;
2064         }
2065
2066         if (ttrace->entry_time) {
2067                 duration = sample->time - ttrace->entry_time;
2068                 if (trace__filter_duration(trace, duration))
2069                         goto out;
2070                 duration_calculated = true;
2071         } else if (trace->duration_filter)
2072                 goto out;
2073
2074         if (sample->callchain) {
2075                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2076                 if (callchain_ret == 0) {
2077                         if (callchain_cursor.nr < trace->min_stack)
2078                                 goto out;
2079                         callchain_ret = 1;
2080                 }
2081         }
2082
2083         if (trace->summary_only || (ret >= 0 && trace->failure_only))
2084                 goto out;
2085
2086         trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
2087
2088         if (ttrace->entry_pending) {
2089                 printed = fprintf(trace->output, "%s", ttrace->entry_str);
2090         } else {
2091                 printed += fprintf(trace->output, " ... [");
2092                 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
2093                 printed += 9;
2094                 printed += fprintf(trace->output, "]: %s()", sc->name);
2095         }
2096
2097         printed++; /* the closing ')' */
2098
2099         if (alignment > printed)
2100                 alignment -= printed;
2101         else
2102                 alignment = 0;
2103
2104         fprintf(trace->output, ")%*s= ", alignment, " ");
2105
2106         if (sc->fmt == NULL) {
2107                 if (ret < 0)
2108                         goto errno_print;
2109 signed_print:
2110                 fprintf(trace->output, "%ld", ret);
2111         } else if (ret < 0) {
2112 errno_print: {
2113                 char bf[STRERR_BUFSIZE];
2114                 const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
2115                            *e = errno_to_name(evsel, -ret);
2116
2117                 fprintf(trace->output, "-1 %s (%s)", e, emsg);
2118         }
2119         } else if (ret == 0 && sc->fmt->timeout)
2120                 fprintf(trace->output, "0 (Timeout)");
2121         else if (ttrace->ret_scnprintf) {
2122                 char bf[1024];
2123                 struct syscall_arg arg = {
2124                         .val    = ret,
2125                         .thread = thread,
2126                         .trace  = trace,
2127                 };
2128                 ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
2129                 ttrace->ret_scnprintf = NULL;
2130                 fprintf(trace->output, "%s", bf);
2131         } else if (sc->fmt->hexret)
2132                 fprintf(trace->output, "%#lx", ret);
2133         else if (sc->fmt->errpid) {
2134                 struct thread *child = machine__find_thread(trace->host, ret, ret);
2135
2136                 if (child != NULL) {
2137                         fprintf(trace->output, "%ld", ret);
2138                         if (child->comm_set)
2139                                 fprintf(trace->output, " (%s)", thread__comm_str(child));
2140                         thread__put(child);
2141                 }
2142         } else
2143                 goto signed_print;
2144
2145         fputc('\n', trace->output);
2146
2147         /*
2148          * We only consider an 'event' for the sake of --max-events a non-filtered
2149          * sys_enter + sys_exit and other tracepoint events.
2150          */
2151         if (++trace->nr_events_printed == trace->max_events && trace->max_events != ULONG_MAX)
2152                 interrupted = true;
2153
2154         if (callchain_ret > 0)
2155                 trace__fprintf_callchain(trace, sample);
2156         else if (callchain_ret < 0)
2157                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2158 out:
2159         ttrace->entry_pending = false;
2160         err = 0;
2161 out_put:
2162         thread__put(thread);
2163         return err;
2164 }
2165
2166 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
2167                               union perf_event *event __maybe_unused,
2168                               struct perf_sample *sample)
2169 {
2170         struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2171         struct thread_trace *ttrace;
2172         size_t filename_len, entry_str_len, to_move;
2173         ssize_t remaining_space;
2174         char *pos;
2175         const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
2176
2177         if (!thread)
2178                 goto out;
2179
2180         ttrace = thread__priv(thread);
2181         if (!ttrace)
2182                 goto out_put;
2183
2184         filename_len = strlen(filename);
2185         if (filename_len == 0)
2186                 goto out_put;
2187
2188         if (ttrace->filename.namelen < filename_len) {
2189                 char *f = realloc(ttrace->filename.name, filename_len + 1);
2190
2191                 if (f == NULL)
2192                         goto out_put;
2193
2194                 ttrace->filename.namelen = filename_len;
2195                 ttrace->filename.name = f;
2196         }
2197
2198         strcpy(ttrace->filename.name, filename);
2199         ttrace->filename.pending_open = true;
2200
2201         if (!ttrace->filename.ptr)
2202                 goto out_put;
2203
2204         entry_str_len = strlen(ttrace->entry_str);
2205         remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
2206         if (remaining_space <= 0)
2207                 goto out_put;
2208
2209         if (filename_len > (size_t)remaining_space) {
2210                 filename += filename_len - remaining_space;
2211                 filename_len = remaining_space;
2212         }
2213
2214         to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
2215         pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
2216         memmove(pos + filename_len, pos, to_move);
2217         memcpy(pos, filename, filename_len);
2218
2219         ttrace->filename.ptr = 0;
2220         ttrace->filename.entry_str_pos = 0;
2221 out_put:
2222         thread__put(thread);
2223 out:
2224         return 0;
2225 }
2226
2227 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
2228                                      union perf_event *event __maybe_unused,
2229                                      struct perf_sample *sample)
2230 {
2231         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
2232         double runtime_ms = (double)runtime / NSEC_PER_MSEC;
2233         struct thread *thread = machine__findnew_thread(trace->host,
2234                                                         sample->pid,
2235                                                         sample->tid);
2236         struct thread_trace *ttrace = thread__trace(thread, trace->output);
2237
2238         if (ttrace == NULL)
2239                 goto out_dump;
2240
2241         ttrace->runtime_ms += runtime_ms;
2242         trace->runtime_ms += runtime_ms;
2243 out_put:
2244         thread__put(thread);
2245         return 0;
2246
2247 out_dump:
2248         fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
2249                evsel->name,
2250                perf_evsel__strval(evsel, sample, "comm"),
2251                (pid_t)perf_evsel__intval(evsel, sample, "pid"),
2252                runtime,
2253                perf_evsel__intval(evsel, sample, "vruntime"));
2254         goto out_put;
2255 }
2256
2257 static int bpf_output__printer(enum binary_printer_ops op,
2258                                unsigned int val, void *extra __maybe_unused, FILE *fp)
2259 {
2260         unsigned char ch = (unsigned char)val;
2261
2262         switch (op) {
2263         case BINARY_PRINT_CHAR_DATA:
2264                 return fprintf(fp, "%c", isprint(ch) ? ch : '.');
2265         case BINARY_PRINT_DATA_BEGIN:
2266         case BINARY_PRINT_LINE_BEGIN:
2267         case BINARY_PRINT_ADDR:
2268         case BINARY_PRINT_NUM_DATA:
2269         case BINARY_PRINT_NUM_PAD:
2270         case BINARY_PRINT_SEP:
2271         case BINARY_PRINT_CHAR_PAD:
2272         case BINARY_PRINT_LINE_END:
2273         case BINARY_PRINT_DATA_END:
2274         default:
2275                 break;
2276         }
2277
2278         return 0;
2279 }
2280
2281 static void bpf_output__fprintf(struct trace *trace,
2282                                 struct perf_sample *sample)
2283 {
2284         binary__fprintf(sample->raw_data, sample->raw_size, 8,
2285                         bpf_output__printer, NULL, trace->output);
2286         ++trace->nr_events_printed;
2287 }
2288
2289 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
2290                                 union perf_event *event __maybe_unused,
2291                                 struct perf_sample *sample)
2292 {
2293         struct thread *thread;
2294         int callchain_ret = 0;
2295         /*
2296          * Check if we called perf_evsel__disable(evsel) due to, for instance,
2297          * this event's max_events having been hit and this is an entry coming
2298          * from the ring buffer that we should discard, since the max events
2299          * have already been considered/printed.
2300          */
2301         if (evsel->disabled)
2302                 return 0;
2303
2304         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2305
2306         if (sample->callchain) {
2307                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2308                 if (callchain_ret == 0) {
2309                         if (callchain_cursor.nr < trace->min_stack)
2310                                 goto out;
2311                         callchain_ret = 1;
2312                 }
2313         }
2314
2315         trace__printf_interrupted_entry(trace);
2316         trace__fprintf_tstamp(trace, sample->time, trace->output);
2317
2318         if (trace->trace_syscalls && trace->show_duration)
2319                 fprintf(trace->output, "(         ): ");
2320
2321         if (thread)
2322                 trace__fprintf_comm_tid(trace, thread, trace->output);
2323
2324         if (evsel == trace->syscalls.events.augmented) {
2325                 int id = perf_evsel__sc_tp_uint(evsel, id, sample);
2326                 struct syscall *sc = trace__syscall_info(trace, evsel, id);
2327
2328                 if (sc) {
2329                         fprintf(trace->output, "%s(", sc->name);
2330                         trace__fprintf_sys_enter(trace, evsel, sample);
2331                         fputc(')', trace->output);
2332                         goto newline;
2333                 }
2334
2335                 /*
2336                  * XXX: Not having the associated syscall info or not finding/adding
2337                  *      the thread should never happen, but if it does...
2338                  *      fall thru and print it as a bpf_output event.
2339                  */
2340         }
2341
2342         fprintf(trace->output, "%s:", evsel->name);
2343
2344         if (perf_evsel__is_bpf_output(evsel)) {
2345                 bpf_output__fprintf(trace, sample);
2346         } else if (evsel->tp_format) {
2347                 if (strncmp(evsel->tp_format->name, "sys_enter_", 10) ||
2348                     trace__fprintf_sys_enter(trace, evsel, sample)) {
2349                         event_format__fprintf(evsel->tp_format, sample->cpu,
2350                                               sample->raw_data, sample->raw_size,
2351                                               trace->output);
2352                         ++trace->nr_events_printed;
2353
2354                         if (evsel->max_events != ULONG_MAX && ++evsel->nr_events_printed == evsel->max_events) {
2355                                 perf_evsel__disable(evsel);
2356                                 perf_evsel__close(evsel);
2357                         }
2358                 }
2359         }
2360
2361 newline:
2362         fprintf(trace->output, "\n");
2363
2364         if (callchain_ret > 0)
2365                 trace__fprintf_callchain(trace, sample);
2366         else if (callchain_ret < 0)
2367                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2368 out:
2369         thread__put(thread);
2370         return 0;
2371 }
2372
2373 static void print_location(FILE *f, struct perf_sample *sample,
2374                            struct addr_location *al,
2375                            bool print_dso, bool print_sym)
2376 {
2377
2378         if ((verbose > 0 || print_dso) && al->map)
2379                 fprintf(f, "%s@", al->map->dso->long_name);
2380
2381         if ((verbose > 0 || print_sym) && al->sym)
2382                 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
2383                         al->addr - al->sym->start);
2384         else if (al->map)
2385                 fprintf(f, "0x%" PRIx64, al->addr);
2386         else
2387                 fprintf(f, "0x%" PRIx64, sample->addr);
2388 }
2389
2390 static int trace__pgfault(struct trace *trace,
2391                           struct perf_evsel *evsel,
2392                           union perf_event *event __maybe_unused,
2393                           struct perf_sample *sample)
2394 {
2395         struct thread *thread;
2396         struct addr_location al;
2397         char map_type = 'd';
2398         struct thread_trace *ttrace;
2399         int err = -1;
2400         int callchain_ret = 0;
2401
2402         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2403
2404         if (sample->callchain) {
2405                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2406                 if (callchain_ret == 0) {
2407                         if (callchain_cursor.nr < trace->min_stack)
2408                                 goto out_put;
2409                         callchain_ret = 1;
2410                 }
2411         }
2412
2413         ttrace = thread__trace(thread, trace->output);
2414         if (ttrace == NULL)
2415                 goto out_put;
2416
2417         if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2418                 ttrace->pfmaj++;
2419         else
2420                 ttrace->pfmin++;
2421
2422         if (trace->summary_only)
2423                 goto out;
2424
2425         thread__find_symbol(thread, sample->cpumode, sample->ip, &al);
2426
2427         trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
2428
2429         fprintf(trace->output, "%sfault [",
2430                 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2431                 "maj" : "min");
2432
2433         print_location(trace->output, sample, &al, false, true);
2434
2435         fprintf(trace->output, "] => ");
2436
2437         thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
2438
2439         if (!al.map) {
2440                 thread__find_symbol(thread, sample->cpumode, sample->addr, &al);
2441
2442                 if (al.map)
2443                         map_type = 'x';
2444                 else
2445                         map_type = '?';
2446         }
2447
2448         print_location(trace->output, sample, &al, true, false);
2449
2450         fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2451
2452         if (callchain_ret > 0)
2453                 trace__fprintf_callchain(trace, sample);
2454         else if (callchain_ret < 0)
2455                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2456
2457         ++trace->nr_events_printed;
2458 out:
2459         err = 0;
2460 out_put:
2461         thread__put(thread);
2462         return err;
2463 }
2464
2465 static void trace__set_base_time(struct trace *trace,
2466                                  struct perf_evsel *evsel,
2467                                  struct perf_sample *sample)
2468 {
2469         /*
2470          * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2471          * and don't use sample->time unconditionally, we may end up having
2472          * some other event in the future without PERF_SAMPLE_TIME for good
2473          * reason, i.e. we may not be interested in its timestamps, just in
2474          * it taking place, picking some piece of information when it
2475          * appears in our event stream (vfs_getname comes to mind).
2476          */
2477         if (trace->base_time == 0 && !trace->full_time &&
2478             (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2479                 trace->base_time = sample->time;
2480 }
2481
2482 static int trace__process_sample(struct perf_tool *tool,
2483                                  union perf_event *event,
2484                                  struct perf_sample *sample,
2485                                  struct perf_evsel *evsel,
2486                                  struct machine *machine __maybe_unused)
2487 {
2488         struct trace *trace = container_of(tool, struct trace, tool);
2489         struct thread *thread;
2490         int err = 0;
2491
2492         tracepoint_handler handler = evsel->handler;
2493
2494         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2495         if (thread && thread__is_filtered(thread))
2496                 goto out;
2497
2498         trace__set_base_time(trace, evsel, sample);
2499
2500         if (handler) {
2501                 ++trace->nr_events;
2502                 handler(trace, evsel, event, sample);
2503         }
2504 out:
2505         thread__put(thread);
2506         return err;
2507 }
2508
2509 static int trace__record(struct trace *trace, int argc, const char **argv)
2510 {
2511         unsigned int rec_argc, i, j;
2512         const char **rec_argv;
2513         const char * const record_args[] = {
2514                 "record",
2515                 "-R",
2516                 "-m", "1024",
2517                 "-c", "1",
2518         };
2519
2520         const char * const sc_args[] = { "-e", };
2521         unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2522         const char * const majpf_args[] = { "-e", "major-faults" };
2523         unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2524         const char * const minpf_args[] = { "-e", "minor-faults" };
2525         unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2526
2527         /* +1 is for the event string below */
2528         rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2529                 majpf_args_nr + minpf_args_nr + argc;
2530         rec_argv = calloc(rec_argc + 1, sizeof(char *));
2531
2532         if (rec_argv == NULL)
2533                 return -ENOMEM;
2534
2535         j = 0;
2536         for (i = 0; i < ARRAY_SIZE(record_args); i++)
2537                 rec_argv[j++] = record_args[i];
2538
2539         if (trace->trace_syscalls) {
2540                 for (i = 0; i < sc_args_nr; i++)
2541                         rec_argv[j++] = sc_args[i];
2542
2543                 /* event string may be different for older kernels - e.g., RHEL6 */
2544                 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2545                         rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2546                 else if (is_valid_tracepoint("syscalls:sys_enter"))
2547                         rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2548                 else {
2549                         pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2550                         free(rec_argv);
2551                         return -1;
2552                 }
2553         }
2554
2555         if (trace->trace_pgfaults & TRACE_PFMAJ)
2556                 for (i = 0; i < majpf_args_nr; i++)
2557                         rec_argv[j++] = majpf_args[i];
2558
2559         if (trace->trace_pgfaults & TRACE_PFMIN)
2560                 for (i = 0; i < minpf_args_nr; i++)
2561                         rec_argv[j++] = minpf_args[i];
2562
2563         for (i = 0; i < (unsigned int)argc; i++)
2564                 rec_argv[j++] = argv[i];
2565
2566         return cmd_record(j, rec_argv);
2567 }
2568
2569 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2570
2571 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2572 {
2573         bool found = false;
2574         struct perf_evsel *evsel, *tmp;
2575         struct parse_events_error err = { .idx = 0, };
2576         int ret = parse_events(evlist, "probe:vfs_getname*", &err);
2577
2578         if (ret)
2579                 return false;
2580
2581         evlist__for_each_entry_safe(evlist, evsel, tmp) {
2582                 if (!strstarts(perf_evsel__name(evsel), "probe:vfs_getname"))
2583                         continue;
2584
2585                 if (perf_evsel__field(evsel, "pathname")) {
2586                         evsel->handler = trace__vfs_getname;
2587                         found = true;
2588                         continue;
2589                 }
2590
2591                 list_del_init(&evsel->node);
2592                 evsel->evlist = NULL;
2593                 perf_evsel__delete(evsel);
2594         }
2595
2596         return found;
2597 }
2598
2599 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2600 {
2601         struct perf_evsel *evsel;
2602         struct perf_event_attr attr = {
2603                 .type = PERF_TYPE_SOFTWARE,
2604                 .mmap_data = 1,
2605         };
2606
2607         attr.config = config;
2608         attr.sample_period = 1;
2609
2610         event_attr_init(&attr);
2611
2612         evsel = perf_evsel__new(&attr);
2613         if (evsel)
2614                 evsel->handler = trace__pgfault;
2615
2616         return evsel;
2617 }
2618
2619 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2620 {
2621         const u32 type = event->header.type;
2622         struct perf_evsel *evsel;
2623
2624         if (type != PERF_RECORD_SAMPLE) {
2625                 trace__process_event(trace, trace->host, event, sample);
2626                 return;
2627         }
2628
2629         evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2630         if (evsel == NULL) {
2631                 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2632                 return;
2633         }
2634
2635         trace__set_base_time(trace, evsel, sample);
2636
2637         if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2638             sample->raw_data == NULL) {
2639                 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2640                        perf_evsel__name(evsel), sample->tid,
2641                        sample->cpu, sample->raw_size);
2642         } else {
2643                 tracepoint_handler handler = evsel->handler;
2644                 handler(trace, evsel, event, sample);
2645         }
2646
2647         if (trace->nr_events_printed >= trace->max_events && trace->max_events != ULONG_MAX)
2648                 interrupted = true;
2649 }
2650
2651 static int trace__add_syscall_newtp(struct trace *trace)
2652 {
2653         int ret = -1;
2654         struct perf_evlist *evlist = trace->evlist;
2655         struct perf_evsel *sys_enter, *sys_exit;
2656
2657         sys_enter = perf_evsel__raw_syscall_newtp("sys_enter", trace__sys_enter);
2658         if (sys_enter == NULL)
2659                 goto out;
2660
2661         if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2662                 goto out_delete_sys_enter;
2663
2664         sys_exit = perf_evsel__raw_syscall_newtp("sys_exit", trace__sys_exit);
2665         if (sys_exit == NULL)
2666                 goto out_delete_sys_enter;
2667
2668         if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2669                 goto out_delete_sys_exit;
2670
2671         perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
2672         perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);
2673
2674         perf_evlist__add(evlist, sys_enter);
2675         perf_evlist__add(evlist, sys_exit);
2676
2677         if (callchain_param.enabled && !trace->kernel_syscallchains) {
2678                 /*
2679                  * We're interested only in the user space callchain
2680                  * leading to the syscall, allow overriding that for
2681                  * debugging reasons using --kernel_syscall_callchains
2682                  */
2683                 sys_exit->attr.exclude_callchain_kernel = 1;
2684         }
2685
2686         trace->syscalls.events.sys_enter = sys_enter;
2687         trace->syscalls.events.sys_exit  = sys_exit;
2688
2689         ret = 0;
2690 out:
2691         return ret;
2692
2693 out_delete_sys_exit:
2694         perf_evsel__delete_priv(sys_exit);
2695 out_delete_sys_enter:
2696         perf_evsel__delete_priv(sys_enter);
2697         goto out;
2698 }
2699
2700 static int trace__set_ev_qualifier_tp_filter(struct trace *trace)
2701 {
2702         int err = -1;
2703         struct perf_evsel *sys_exit;
2704         char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2705                                                 trace->ev_qualifier_ids.nr,
2706                                                 trace->ev_qualifier_ids.entries);
2707
2708         if (filter == NULL)
2709                 goto out_enomem;
2710
2711         if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2712                                           filter)) {
2713                 sys_exit = trace->syscalls.events.sys_exit;
2714                 err = perf_evsel__append_tp_filter(sys_exit, filter);
2715         }
2716
2717         free(filter);
2718 out:
2719         return err;
2720 out_enomem:
2721         errno = ENOMEM;
2722         goto out;
2723 }
2724
2725 #ifdef HAVE_LIBBPF_SUPPORT
2726 static void trace__init_bpf_map_syscall_args(struct trace *trace, int id, struct bpf_map_syscall_entry *entry)
2727 {
2728         struct syscall *sc = trace__syscall_info(trace, NULL, id);
2729         int arg = 0;
2730
2731         if (sc == NULL)
2732                 goto out;
2733
2734         for (; arg < sc->nr_args; ++arg) {
2735                 entry->string_args_len[arg] = 0;
2736                 if (sc->arg_fmt[arg].scnprintf == SCA_FILENAME) {
2737                         /* Should be set like strace -s strsize */
2738                         entry->string_args_len[arg] = PATH_MAX;
2739                 }
2740         }
2741 out:
2742         for (; arg < 6; ++arg)
2743                 entry->string_args_len[arg] = 0;
2744 }
2745 static int trace__set_ev_qualifier_bpf_filter(struct trace *trace)
2746 {
2747         int fd = bpf_map__fd(trace->syscalls.map);
2748         struct bpf_map_syscall_entry value = {
2749                 .enabled = !trace->not_ev_qualifier,
2750         };
2751         int err = 0;
2752         size_t i;
2753
2754         for (i = 0; i < trace->ev_qualifier_ids.nr; ++i) {
2755                 int key = trace->ev_qualifier_ids.entries[i];
2756
2757                 if (value.enabled)
2758                         trace__init_bpf_map_syscall_args(trace, key, &value);
2759
2760                 err = bpf_map_update_elem(fd, &key, &value, BPF_EXIST);
2761                 if (err)
2762                         break;
2763         }
2764
2765         return err;
2766 }
2767
2768 static int __trace__init_syscalls_bpf_map(struct trace *trace, bool enabled)
2769 {
2770         int fd = bpf_map__fd(trace->syscalls.map);
2771         struct bpf_map_syscall_entry value = {
2772                 .enabled = enabled,
2773         };
2774         int err = 0, key;
2775
2776         for (key = 0; key < trace->sctbl->syscalls.nr_entries; ++key) {
2777                 if (enabled)
2778                         trace__init_bpf_map_syscall_args(trace, key, &value);
2779
2780                 err = bpf_map_update_elem(fd, &key, &value, BPF_ANY);
2781                 if (err)
2782                         break;
2783         }
2784
2785         return err;
2786 }
2787
2788 static int trace__init_syscalls_bpf_map(struct trace *trace)
2789 {
2790         bool enabled = true;
2791
2792         if (trace->ev_qualifier_ids.nr)
2793                 enabled = trace->not_ev_qualifier;
2794
2795         return __trace__init_syscalls_bpf_map(trace, enabled);
2796 }
2797 #else
2798 static int trace__set_ev_qualifier_bpf_filter(struct trace *trace __maybe_unused)
2799 {
2800         return 0;
2801 }
2802
2803 static int trace__init_syscalls_bpf_map(struct trace *trace __maybe_unused)
2804 {
2805         return 0;
2806 }
2807 #endif // HAVE_LIBBPF_SUPPORT
2808
2809 static int trace__set_ev_qualifier_filter(struct trace *trace)
2810 {
2811         if (trace->syscalls.map)
2812                 return trace__set_ev_qualifier_bpf_filter(trace);
2813         if (trace->syscalls.events.sys_enter)
2814                 return trace__set_ev_qualifier_tp_filter(trace);
2815         return 0;
2816 }
2817
2818 static int bpf_map__set_filter_pids(struct bpf_map *map __maybe_unused,
2819                                     size_t npids __maybe_unused, pid_t *pids __maybe_unused)
2820 {
2821         int err = 0;
2822 #ifdef HAVE_LIBBPF_SUPPORT
2823         bool value = true;
2824         int map_fd = bpf_map__fd(map);
2825         size_t i;
2826
2827         for (i = 0; i < npids; ++i) {
2828                 err = bpf_map_update_elem(map_fd, &pids[i], &value, BPF_ANY);
2829                 if (err)
2830                         break;
2831         }
2832 #endif
2833         return err;
2834 }
2835
2836 static int trace__set_filter_loop_pids(struct trace *trace)
2837 {
2838         unsigned int nr = 1, err;
2839         pid_t pids[32] = {
2840                 getpid(),
2841         };
2842         struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
2843
2844         while (thread && nr < ARRAY_SIZE(pids)) {
2845                 struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
2846
2847                 if (parent == NULL)
2848                         break;
2849
2850                 if (!strcmp(thread__comm_str(parent), "sshd") ||
2851                     strstarts(thread__comm_str(parent), "gnome-terminal")) {
2852                         pids[nr++] = parent->tid;
2853                         break;
2854                 }
2855                 thread = parent;
2856         }
2857
2858         err = perf_evlist__set_tp_filter_pids(trace->evlist, nr, pids);
2859         if (!err && trace->filter_pids.map)
2860                 err = bpf_map__set_filter_pids(trace->filter_pids.map, nr, pids);
2861
2862         return err;
2863 }
2864
2865 static int trace__set_filter_pids(struct trace *trace)
2866 {
2867         int err = 0;
2868         /*
2869          * Better not use !target__has_task() here because we need to cover the
2870          * case where no threads were specified in the command line, but a
2871          * workload was, and in that case we will fill in the thread_map when
2872          * we fork the workload in perf_evlist__prepare_workload.
2873          */
2874         if (trace->filter_pids.nr > 0) {
2875                 err = perf_evlist__set_tp_filter_pids(trace->evlist, trace->filter_pids.nr,
2876                                                       trace->filter_pids.entries);
2877                 if (!err && trace->filter_pids.map) {
2878                         err = bpf_map__set_filter_pids(trace->filter_pids.map, trace->filter_pids.nr,
2879                                                        trace->filter_pids.entries);
2880                 }
2881         } else if (thread_map__pid(trace->evlist->threads, 0) == -1) {
2882                 err = trace__set_filter_loop_pids(trace);
2883         }
2884
2885         return err;
2886 }
2887
2888 static int __trace__deliver_event(struct trace *trace, union perf_event *event)
2889 {
2890         struct perf_evlist *evlist = trace->evlist;
2891         struct perf_sample sample;
2892         int err;
2893
2894         err = perf_evlist__parse_sample(evlist, event, &sample);
2895         if (err)
2896                 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2897         else
2898                 trace__handle_event(trace, event, &sample);
2899
2900         return 0;
2901 }
2902
2903 static int __trace__flush_events(struct trace *trace)
2904 {
2905         u64 first = ordered_events__first_time(&trace->oe.data);
2906         u64 flush = trace->oe.last - NSEC_PER_SEC;
2907
2908         /* Is there some thing to flush.. */
2909         if (first && first < flush)
2910                 return ordered_events__flush_time(&trace->oe.data, flush);
2911
2912         return 0;
2913 }
2914
2915 static int trace__flush_events(struct trace *trace)
2916 {
2917         return !trace->sort_events ? 0 : __trace__flush_events(trace);
2918 }
2919
2920 static int trace__deliver_event(struct trace *trace, union perf_event *event)
2921 {
2922         int err;
2923
2924         if (!trace->sort_events)
2925                 return __trace__deliver_event(trace, event);
2926
2927         err = perf_evlist__parse_sample_timestamp(trace->evlist, event, &trace->oe.last);
2928         if (err && err != -1)
2929                 return err;
2930
2931         err = ordered_events__queue(&trace->oe.data, event, trace->oe.last, 0);
2932         if (err)
2933                 return err;
2934
2935         return trace__flush_events(trace);
2936 }
2937
2938 static int ordered_events__deliver_event(struct ordered_events *oe,
2939                                          struct ordered_event *event)
2940 {
2941         struct trace *trace = container_of(oe, struct trace, oe.data);
2942
2943         return __trace__deliver_event(trace, event->event);
2944 }
2945
2946 static int trace__run(struct trace *trace, int argc, const char **argv)
2947 {
2948         struct perf_evlist *evlist = trace->evlist;
2949         struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2950         int err = -1, i;
2951         unsigned long before;
2952         const bool forks = argc > 0;
2953         bool draining = false;
2954
2955         trace->live = true;
2956
2957         if (!trace->raw_augmented_syscalls) {
2958                 if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2959                         goto out_error_raw_syscalls;
2960
2961                 if (trace->trace_syscalls)
2962                         trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2963         }
2964
2965         if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2966                 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2967                 if (pgfault_maj == NULL)
2968                         goto out_error_mem;
2969                 perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2970                 perf_evlist__add(evlist, pgfault_maj);
2971         }
2972
2973         if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2974                 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2975                 if (pgfault_min == NULL)
2976                         goto out_error_mem;
2977                 perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2978                 perf_evlist__add(evlist, pgfault_min);
2979         }
2980
2981         if (trace->sched &&
2982             perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2983                                    trace__sched_stat_runtime))
2984                 goto out_error_sched_stat_runtime;
2985
2986         /*
2987          * If a global cgroup was set, apply it to all the events without an
2988          * explicit cgroup. I.e.:
2989          *
2990          *      trace -G A -e sched:*switch
2991          *
2992          * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
2993          * _and_ sched:sched_switch to the 'A' cgroup, while:
2994          *
2995          * trace -e sched:*switch -G A
2996          *
2997          * will only set the sched:sched_switch event to the 'A' cgroup, all the
2998          * other events (raw_syscalls:sys_{enter,exit}, etc are left "without"
2999          * a cgroup (on the root cgroup, sys wide, etc).
3000          *
3001          * Multiple cgroups:
3002          *
3003          * trace -G A -e sched:*switch -G B
3004          *
3005          * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
3006          * to the 'B' cgroup.
3007          *
3008          * evlist__set_default_cgroup() grabs a reference of the passed cgroup
3009          * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
3010          */
3011         if (trace->cgroup)
3012                 evlist__set_default_cgroup(trace->evlist, trace->cgroup);
3013
3014         err = perf_evlist__create_maps(evlist, &trace->opts.target);
3015         if (err < 0) {
3016                 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
3017                 goto out_delete_evlist;
3018         }
3019
3020         err = trace__symbols_init(trace, evlist);
3021         if (err < 0) {
3022                 fprintf(trace->output, "Problems initializing symbol libraries!\n");
3023                 goto out_delete_evlist;
3024         }
3025
3026         perf_evlist__config(evlist, &trace->opts, &callchain_param);
3027
3028         signal(SIGCHLD, sig_handler);
3029         signal(SIGINT, sig_handler);
3030
3031         if (forks) {
3032                 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
3033                                                     argv, false, NULL);
3034                 if (err < 0) {
3035                         fprintf(trace->output, "Couldn't run the workload!\n");
3036                         goto out_delete_evlist;
3037                 }
3038         }
3039
3040         err = perf_evlist__open(evlist);
3041         if (err < 0)
3042                 goto out_error_open;
3043
3044         err = bpf__apply_obj_config();
3045         if (err) {
3046                 char errbuf[BUFSIZ];
3047
3048                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
3049                 pr_err("ERROR: Apply config to BPF failed: %s\n",
3050                          errbuf);
3051                 goto out_error_open;
3052         }
3053
3054         err = trace__set_filter_pids(trace);
3055         if (err < 0)
3056                 goto out_error_mem;
3057
3058         if (trace->syscalls.map)
3059                 trace__init_syscalls_bpf_map(trace);
3060
3061         if (trace->ev_qualifier_ids.nr > 0) {
3062                 err = trace__set_ev_qualifier_filter(trace);
3063                 if (err < 0)
3064                         goto out_errno;
3065
3066                 if (trace->syscalls.events.sys_exit) {
3067                         pr_debug("event qualifier tracepoint filter: %s\n",
3068                                  trace->syscalls.events.sys_exit->filter);
3069                 }
3070         }
3071
3072         err = perf_evlist__apply_filters(evlist, &evsel);
3073         if (err < 0)
3074                 goto out_error_apply_filters;
3075
3076         if (trace->dump.map)
3077                 bpf_map__fprintf(trace->dump.map, trace->output);
3078
3079         err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
3080         if (err < 0)
3081                 goto out_error_mmap;
3082
3083         if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
3084                 perf_evlist__enable(evlist);
3085
3086         if (forks)
3087                 perf_evlist__start_workload(evlist);
3088
3089         if (trace->opts.initial_delay) {
3090                 usleep(trace->opts.initial_delay * 1000);
3091                 perf_evlist__enable(evlist);
3092         }
3093
3094         trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
3095                                   evlist->threads->nr > 1 ||
3096                                   perf_evlist__first(evlist)->attr.inherit;
3097
3098         /*
3099          * Now that we already used evsel->attr to ask the kernel to setup the
3100          * events, lets reuse evsel->attr.sample_max_stack as the limit in
3101          * trace__resolve_callchain(), allowing per-event max-stack settings
3102          * to override an explicitly set --max-stack global setting.
3103          */
3104         evlist__for_each_entry(evlist, evsel) {
3105                 if (evsel__has_callchain(evsel) &&
3106                     evsel->attr.sample_max_stack == 0)
3107                         evsel->attr.sample_max_stack = trace->max_stack;
3108         }
3109 again:
3110         before = trace->nr_events;
3111
3112         for (i = 0; i < evlist->nr_mmaps; i++) {
3113                 union perf_event *event;
3114                 struct perf_mmap *md;
3115
3116                 md = &evlist->mmap[i];
3117                 if (perf_mmap__read_init(md) < 0)
3118                         continue;
3119
3120                 while ((event = perf_mmap__read_event(md)) != NULL) {
3121                         ++trace->nr_events;
3122
3123                         err = trace__deliver_event(trace, event);
3124                         if (err)
3125                                 goto out_disable;
3126
3127                         perf_mmap__consume(md);
3128
3129                         if (interrupted)
3130                                 goto out_disable;
3131
3132                         if (done && !draining) {
3133                                 perf_evlist__disable(evlist);
3134                                 draining = true;
3135                         }
3136                 }
3137                 perf_mmap__read_done(md);
3138         }
3139
3140         if (trace->nr_events == before) {
3141                 int timeout = done ? 100 : -1;
3142
3143                 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
3144                         if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP | POLLNVAL) == 0)
3145                                 draining = true;
3146
3147                         goto again;
3148                 } else {
3149                         if (trace__flush_events(trace))
3150                                 goto out_disable;
3151                 }
3152         } else {
3153                 goto again;
3154         }
3155
3156 out_disable:
3157         thread__zput(trace->current);
3158
3159         perf_evlist__disable(evlist);
3160
3161         if (trace->sort_events)
3162                 ordered_events__flush(&trace->oe.data, OE_FLUSH__FINAL);
3163
3164         if (!err) {
3165                 if (trace->summary)
3166                         trace__fprintf_thread_summary(trace, trace->output);
3167
3168                 if (trace->show_tool_stats) {
3169                         fprintf(trace->output, "Stats:\n "
3170                                                " vfs_getname : %" PRIu64 "\n"
3171                                                " proc_getname: %" PRIu64 "\n",
3172                                 trace->stats.vfs_getname,
3173                                 trace->stats.proc_getname);
3174                 }
3175         }
3176
3177 out_delete_evlist:
3178         trace__symbols__exit(trace);
3179
3180         perf_evlist__delete(evlist);
3181         cgroup__put(trace->cgroup);
3182         trace->evlist = NULL;
3183         trace->live = false;
3184         return err;
3185 {
3186         char errbuf[BUFSIZ];
3187
3188 out_error_sched_stat_runtime:
3189         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
3190         goto out_error;
3191
3192 out_error_raw_syscalls:
3193         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
3194         goto out_error;
3195
3196 out_error_mmap:
3197         perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
3198         goto out_error;
3199
3200 out_error_open:
3201         perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
3202
3203 out_error:
3204         fprintf(trace->output, "%s\n", errbuf);
3205         goto out_delete_evlist;
3206
3207 out_error_apply_filters:
3208         fprintf(trace->output,
3209                 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
3210                 evsel->filter, perf_evsel__name(evsel), errno,
3211                 str_error_r(errno, errbuf, sizeof(errbuf)));
3212         goto out_delete_evlist;
3213 }
3214 out_error_mem:
3215         fprintf(trace->output, "Not enough memory to run!\n");
3216         goto out_delete_evlist;
3217
3218 out_errno:
3219         fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
3220         goto out_delete_evlist;
3221 }
3222
3223 static int trace__replay(struct trace *trace)
3224 {
3225         const struct perf_evsel_str_handler handlers[] = {
3226                 { "probe:vfs_getname",       trace__vfs_getname, },
3227         };
3228         struct perf_data data = {
3229                 .path  = input_name,
3230                 .mode  = PERF_DATA_MODE_READ,
3231                 .force = trace->force,
3232         };
3233         struct perf_session *session;
3234         struct perf_evsel *evsel;
3235         int err = -1;
3236
3237         trace->tool.sample        = trace__process_sample;
3238         trace->tool.mmap          = perf_event__process_mmap;
3239         trace->tool.mmap2         = perf_event__process_mmap2;
3240         trace->tool.comm          = perf_event__process_comm;
3241         trace->tool.exit          = perf_event__process_exit;
3242         trace->tool.fork          = perf_event__process_fork;
3243         trace->tool.attr          = perf_event__process_attr;
3244         trace->tool.tracing_data  = perf_event__process_tracing_data;
3245         trace->tool.build_id      = perf_event__process_build_id;
3246         trace->tool.namespaces    = perf_event__process_namespaces;
3247
3248         trace->tool.ordered_events = true;
3249         trace->tool.ordering_requires_timestamps = true;
3250
3251         /* add tid to output */
3252         trace->multiple_threads = true;
3253
3254         session = perf_session__new(&data, false, &trace->tool);
3255         if (session == NULL)
3256                 return -1;
3257
3258         if (trace->opts.target.pid)
3259                 symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
3260
3261         if (trace->opts.target.tid)
3262                 symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
3263
3264         if (symbol__init(&session->header.env) < 0)
3265                 goto out;
3266
3267         trace->host = &session->machines.host;
3268
3269         err = perf_session__set_tracepoints_handlers(session, handlers);
3270         if (err)
3271                 goto out;
3272
3273         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
3274                                                      "raw_syscalls:sys_enter");
3275         /* older kernels have syscalls tp versus raw_syscalls */
3276         if (evsel == NULL)
3277                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
3278                                                              "syscalls:sys_enter");
3279
3280         if (evsel &&
3281             (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_enter) < 0 ||
3282             perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
3283                 pr_err("Error during initialize raw_syscalls:sys_enter event\n");
3284                 goto out;
3285         }
3286
3287         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
3288                                                      "raw_syscalls:sys_exit");
3289         if (evsel == NULL)
3290                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
3291                                                              "syscalls:sys_exit");
3292         if (evsel &&
3293             (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_exit) < 0 ||
3294             perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
3295                 pr_err("Error during initialize raw_syscalls:sys_exit event\n");
3296                 goto out;
3297         }
3298
3299         evlist__for_each_entry(session->evlist, evsel) {
3300                 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
3301                     (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
3302                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
3303                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
3304                         evsel->handler = trace__pgfault;
3305         }
3306
3307         setup_pager();
3308
3309         err = perf_session__process_events(session);
3310         if (err)
3311                 pr_err("Failed to process events, error %d", err);
3312
3313         else if (trace->summary)
3314                 trace__fprintf_thread_summary(trace, trace->output);
3315
3316 out:
3317         perf_session__delete(session);
3318
3319         return err;
3320 }
3321
/*
 * Print the banner that precedes the per-thread summary.
 *
 * Returns fprintf()'s result converted to size_t.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
3330
3331 DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
3332         struct stats    *stats;
3333         double          msecs;
3334         int             syscall;
3335 )
3336 {
3337         struct int_node *source = rb_entry(nd, struct int_node, rb_node);
3338         struct stats *stats = source->priv;
3339
3340         entry->syscall = source->i;
3341         entry->stats   = stats;
3342         entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
3343 }
3344
3345 static size_t thread__dump_stats(struct thread_trace *ttrace,
3346                                  struct trace *trace, FILE *fp)
3347 {
3348         size_t printed = 0;
3349         struct syscall *sc;
3350         struct rb_node *nd;
3351         DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
3352
3353         if (syscall_stats == NULL)
3354                 return 0;
3355
3356         printed += fprintf(fp, "\n");
3357
3358         printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
3359         printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
3360         printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
3361
3362         resort_rb__for_each_entry(nd, syscall_stats) {
3363                 struct stats *stats = syscall_stats_entry->stats;
3364                 if (stats) {
3365                         double min = (double)(stats->min) / NSEC_PER_MSEC;
3366                         double max = (double)(stats->max) / NSEC_PER_MSEC;
3367                         double avg = avg_stats(stats);
3368                         double pct;
3369                         u64 n = (u64) stats->n;
3370
3371                         pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
3372                         avg /= NSEC_PER_MSEC;
3373
3374                         sc = &trace->syscalls.table[syscall_stats_entry->syscall];
3375                         printed += fprintf(fp, "   %-15s", sc->name);
3376                         printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
3377                                            n, syscall_stats_entry->msecs, min, avg);
3378                         printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
3379                 }
3380         }
3381
3382         resort_rb__delete(syscall_stats);
3383         printed += fprintf(fp, "\n\n");
3384
3385         return printed;
3386 }
3387
3388 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
3389 {
3390         size_t printed = 0;
3391         struct thread_trace *ttrace = thread__priv(thread);
3392         double ratio;
3393
3394         if (ttrace == NULL)
3395                 return 0;
3396
3397         ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
3398
3399         printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
3400         printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
3401         printed += fprintf(fp, "%.1f%%", ratio);
3402         if (ttrace->pfmaj)
3403                 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
3404         if (ttrace->pfmin)
3405                 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
3406         if (trace->sched)
3407                 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
3408         else if (fputc('\n', fp) != EOF)
3409                 ++printed;
3410
3411         printed += thread__dump_stats(ttrace, trace, fp);
3412
3413         return printed;
3414 }
3415
3416 static unsigned long thread__nr_events(struct thread_trace *ttrace)
3417 {
3418         return ttrace ? ttrace->nr_events : 0;
3419 }
3420
3421 DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
3422         struct thread *thread;
3423 )
3424 {
3425         entry->thread = rb_entry(nd, struct thread, rb_node);
3426 }
3427
3428 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
3429 {
3430         size_t printed = trace__fprintf_threads_header(fp);
3431         struct rb_node *nd;
3432         int i;
3433
3434         for (i = 0; i < THREADS__TABLE_SIZE; i++) {
3435                 DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);
3436
3437                 if (threads == NULL) {
3438                         fprintf(fp, "%s", "Error sorting output by nr_events!\n");
3439                         return 0;
3440                 }
3441
3442                 resort_rb__for_each_entry(nd, threads)
3443                         printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
3444
3445                 resort_rb__delete(threads);
3446         }
3447         return printed;
3448 }
3449
3450 static int trace__set_duration(const struct option *opt, const char *str,
3451                                int unset __maybe_unused)
3452 {
3453         struct trace *trace = opt->value;
3454
3455         trace->duration_filter = atof(str);
3456         return 0;
3457 }
3458
3459 static int trace__set_filter_pids_from_option(const struct option *opt, const char *str,
3460                                               int unset __maybe_unused)
3461 {
3462         int ret = -1;
3463         size_t i;
3464         struct trace *trace = opt->value;
3465         /*
3466          * FIXME: introduce a intarray class, plain parse csv and create a
3467          * { int nr, int entries[] } struct...
3468          */
3469         struct intlist *list = intlist__new(str);
3470
3471         if (list == NULL)
3472                 return -1;
3473
3474         i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
3475         trace->filter_pids.entries = calloc(i, sizeof(pid_t));
3476
3477         if (trace->filter_pids.entries == NULL)
3478                 goto out;
3479
3480         trace->filter_pids.entries[0] = getpid();
3481
3482         for (i = 1; i < trace->filter_pids.nr; ++i)
3483                 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
3484
3485         intlist__delete(list);
3486         ret = 0;
3487 out:
3488         return ret;
3489 }
3490
3491 static int trace__open_output(struct trace *trace, const char *filename)
3492 {
3493         struct stat st;
3494
3495         if (!stat(filename, &st) && st.st_size) {
3496                 char oldname[PATH_MAX];
3497
3498                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
3499                 unlink(oldname);
3500                 rename(filename, oldname);
3501         }
3502
3503         trace->output = fopen(filename, "w");
3504
3505         return trace->output == NULL ? -errno : 0;
3506 }
3507
3508 static int parse_pagefaults(const struct option *opt, const char *str,
3509                             int unset __maybe_unused)
3510 {
3511         int *trace_pgfaults = opt->value;
3512
3513         if (strcmp(str, "all") == 0)
3514                 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
3515         else if (strcmp(str, "maj") == 0)
3516                 *trace_pgfaults |= TRACE_PFMAJ;
3517         else if (strcmp(str, "min") == 0)
3518                 *trace_pgfaults |= TRACE_PFMIN;
3519         else
3520                 return -1;
3521
3522         return 0;
3523 }
3524
3525 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
3526 {
3527         struct perf_evsel *evsel;
3528
3529         evlist__for_each_entry(evlist, evsel)
3530                 evsel->handler = handler;
3531 }
3532
3533 static int evlist__set_syscall_tp_fields(struct perf_evlist *evlist)
3534 {
3535         struct perf_evsel *evsel;
3536
3537         evlist__for_each_entry(evlist, evsel) {
3538                 if (evsel->priv || !evsel->tp_format)
3539                         continue;
3540
3541                 if (strcmp(evsel->tp_format->system, "syscalls"))
3542                         continue;
3543
3544                 if (perf_evsel__init_syscall_tp(evsel))
3545                         return -1;
3546
3547                 if (!strncmp(evsel->tp_format->name, "sys_enter_", 10)) {
3548                         struct syscall_tp *sc = evsel->priv;
3549
3550                         if (__tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64)))
3551                                 return -1;
3552                 } else if (!strncmp(evsel->tp_format->name, "sys_exit_", 9)) {
3553                         struct syscall_tp *sc = evsel->priv;
3554
3555                         if (__tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap))
3556                                 return -1;
3557                 }
3558         }
3559
3560         return 0;
3561 }
3562
3563 /*
3564  * XXX: Hackish, just splitting the combined -e+--event (syscalls
3565  * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
3566  * existing facilities unchanged (trace->ev_qualifier + parse_options()).
3567  *
3568  * It'd be better to introduce a parse_options() variant that would return a
3569  * list with the terms it didn't match to an event...
3570  */
3571 static int trace__parse_events_option(const struct option *opt, const char *str,
3572                                       int unset __maybe_unused)
3573 {
3574         struct trace *trace = (struct trace *)opt->value;
3575         const char *s = str;
3576         char *sep = NULL, *lists[2] = { NULL, NULL, };
3577         int len = strlen(str) + 1, err = -1, list, idx;
3578         char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
3579         char group_name[PATH_MAX];
3580         struct syscall_fmt *fmt;
3581
3582         if (strace_groups_dir == NULL)
3583                 return -1;
3584
3585         if (*s == '!') {
3586                 ++s;
3587                 trace->not_ev_qualifier = true;
3588         }
3589
3590         while (1) {
3591                 if ((sep = strchr(s, ',')) != NULL)
3592                         *sep = '\0';
3593
3594                 list = 0;
3595                 if (syscalltbl__id(trace->sctbl, s) >= 0 ||
3596                     syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
3597                         list = 1;
3598                         goto do_concat;
3599                 }
3600
3601                 fmt = syscall_fmt__find_by_alias(s);
3602                 if (fmt != NULL) {
3603                         list = 1;
3604                         s = fmt->name;
3605                 } else {
3606                         path__join(group_name, sizeof(group_name), strace_groups_dir, s);
3607                         if (access(group_name, R_OK) == 0)
3608                                 list = 1;
3609                 }
3610 do_concat:
3611                 if (lists[list]) {
3612                         sprintf(lists[list] + strlen(lists[list]), ",%s", s);
3613                 } else {
3614                         lists[list] = malloc(len);
3615                         if (lists[list] == NULL)
3616                                 goto out;
3617                         strcpy(lists[list], s);
3618                 }
3619
3620                 if (!sep)
3621                         break;
3622
3623                 *sep = ',';
3624                 s = sep + 1;
3625         }
3626
3627         if (lists[1] != NULL) {
3628                 struct strlist_config slist_config = {
3629                         .dirname = strace_groups_dir,
3630                 };
3631
3632                 trace->ev_qualifier = strlist__new(lists[1], &slist_config);
3633                 if (trace->ev_qualifier == NULL) {
3634                         fputs("Not enough memory to parse event qualifier", trace->output);
3635                         goto out;
3636                 }
3637
3638                 if (trace__validate_ev_qualifier(trace))
3639                         goto out;
3640                 trace->trace_syscalls = true;
3641         }
3642
3643         err = 0;
3644
3645         if (lists[0]) {
3646                 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
3647                                                "event selector. use 'perf list' to list available events",
3648                                                parse_events_option);
3649                 err = parse_events_option(&o, lists[0], 0);
3650         }
3651 out:
3652         if (sep)
3653                 *sep = ',';
3654
3655         return err;
3656 }
3657
3658 static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
3659 {
3660         struct trace *trace = opt->value;
3661
3662         if (!list_empty(&trace->evlist->entries))
3663                 return parse_cgroups(opt, str, unset);
3664
3665         trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
3666
3667         return 0;
3668 }
3669
3670 static struct bpf_map *bpf__find_map_by_name(const char *name)
3671 {
3672         struct bpf_object *obj, *tmp;
3673
3674         bpf_object__for_each_safe(obj, tmp) {
3675                 struct bpf_map *map = bpf_object__find_map_by_name(obj, name);
3676                 if (map)
3677                         return map;
3678
3679         }
3680
3681         return NULL;
3682 }
3683
3684 static void trace__set_bpf_map_filtered_pids(struct trace *trace)
3685 {
3686         trace->filter_pids.map = bpf__find_map_by_name("pids_filtered");
3687 }
3688
3689 static void trace__set_bpf_map_syscalls(struct trace *trace)
3690 {
3691         trace->syscalls.map = bpf__find_map_by_name("syscalls");
3692 }
3693
3694 static int trace__config(const char *var, const char *value, void *arg)
3695 {
3696         struct trace *trace = arg;
3697         int err = 0;
3698
3699         if (!strcmp(var, "trace.add_events")) {
3700                 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
3701                                                "event selector. use 'perf list' to list available events",
3702                                                parse_events_option);
3703                 /*
3704                  * We can't propagate parse_event_option() return, as it is 1
3705                  * for failure while perf_config() expects -1.
3706                  */
3707                 if (parse_events_option(&o, value, 0))
3708                         err = -1;
3709         } else if (!strcmp(var, "trace.show_timestamp")) {
3710                 trace->show_tstamp = perf_config_bool(var, value);
3711         } else if (!strcmp(var, "trace.show_duration")) {
3712                 trace->show_duration = perf_config_bool(var, value);
3713         } else if (!strcmp(var, "trace.show_arg_names")) {
3714                 trace->show_arg_names = perf_config_bool(var, value);
3715                 if (!trace->show_arg_names)
3716                         trace->show_zeros = true;
3717         } else if (!strcmp(var, "trace.show_zeros")) {
3718                 bool new_show_zeros = perf_config_bool(var, value);
3719                 if (!trace->show_arg_names && !new_show_zeros) {
3720                         pr_warning("trace.show_zeros has to be set when trace.show_arg_names=no\n");
3721                         goto out;
3722                 }
3723                 trace->show_zeros = new_show_zeros;
3724         } else if (!strcmp(var, "trace.show_prefix")) {
3725                 trace->show_string_prefix = perf_config_bool(var, value);
3726         } else if (!strcmp(var, "trace.no_inherit")) {
3727                 trace->opts.no_inherit = perf_config_bool(var, value);
3728         } else if (!strcmp(var, "trace.args_alignment")) {
3729                 int args_alignment = 0;
3730                 if (perf_config_int(&args_alignment, var, value) == 0)
3731                         trace->args_alignment = args_alignment;
3732         }
3733 out:
3734         return err;
3735 }
3736
3737 int cmd_trace(int argc, const char **argv)
3738 {
3739         const char *trace_usage[] = {
3740                 "perf trace [<options>] [<command>]",
3741                 "perf trace [<options>] -- <command> [<options>]",
3742                 "perf trace record [<options>] [<command>]",
3743                 "perf trace record [<options>] -- <command> [<options>]",
3744                 NULL
3745         };
3746         struct trace trace = {
3747                 .syscalls = {
3748                         . max = -1,
3749                 },
3750                 .opts = {
3751                         .target = {
3752                                 .uid       = UINT_MAX,
3753                                 .uses_mmap = true,
3754                         },
3755                         .user_freq     = UINT_MAX,
3756                         .user_interval = ULLONG_MAX,
3757                         .no_buffering  = true,
3758                         .mmap_pages    = UINT_MAX,
3759                 },
3760                 .output = stderr,
3761                 .show_comm = true,
3762                 .show_tstamp = true,
3763                 .show_duration = true,
3764                 .show_arg_names = true,
3765                 .args_alignment = 70,
3766                 .trace_syscalls = false,
3767                 .kernel_syscallchains = false,
3768                 .max_stack = UINT_MAX,
3769                 .max_events = ULONG_MAX,
3770         };
3771         const char *map_dump_str = NULL;
3772         const char *output_name = NULL;
3773         const struct option trace_options[] = {
3774         OPT_CALLBACK('e', "event", &trace, "event",
3775                      "event/syscall selector. use 'perf list' to list available events",
3776                      trace__parse_events_option),
3777         OPT_BOOLEAN(0, "comm", &trace.show_comm,
3778                     "show the thread COMM next to its id"),
3779         OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3780         OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
3781                      trace__parse_events_option),
3782         OPT_STRING('o', "output", &output_name, "file", "output file name"),
3783         OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3784         OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3785                     "trace events on existing process id"),
3786         OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3787                     "trace events on existing thread id"),
3788         OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3789                      "pids to filter (by the kernel)", trace__set_filter_pids_from_option),
3790         OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3791                     "system-wide collection from all CPUs"),
3792         OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3793                     "list of cpus to monitor"),
3794         OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3795                     "child tasks do not inherit counters"),
3796         OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3797                      "number of mmap data pages",
3798                      perf_evlist__parse_mmap_pages),
3799         OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3800                    "user to profile"),
3801         OPT_CALLBACK(0, "duration", &trace, "float",
3802                      "show only events with duration > N.M ms",
3803                      trace__set_duration),
3804 #ifdef HAVE_LIBBPF_SUPPORT
3805         OPT_STRING(0, "map-dump", &map_dump_str, "BPF map", "BPF map to periodically dump"),
3806 #endif
3807         OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3808         OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3809         OPT_BOOLEAN('T', "time", &trace.full_time,
3810                     "Show full timestamp, not time relative to first start"),
3811         OPT_BOOLEAN(0, "failure", &trace.failure_only,
3812                     "Show only syscalls that failed"),
3813         OPT_BOOLEAN('s', "summary", &trace.summary_only,
3814                     "Show only syscall summary with statistics"),
3815         OPT_BOOLEAN('S', "with-summary", &trace.summary,
3816                     "Show all syscalls and summary with statistics"),
3817         OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3818                      "Trace pagefaults", parse_pagefaults, "maj"),
3819         OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3820         OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3821         OPT_CALLBACK(0, "call-graph", &trace.opts,
3822                      "record_mode[,record_size]", record_callchain_help,
3823                      &record_parse_callchain_opt),
3824         OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
3825                     "Show the kernel callchains on the syscall exit path"),
3826         OPT_ULONG(0, "max-events", &trace.max_events,
3827                 "Set the maximum number of events to print, exit after that is reached. "),
3828         OPT_UINTEGER(0, "min-stack", &trace.min_stack,
3829                      "Set the minimum stack depth when parsing the callchain, "
3830                      "anything below the specified depth will be ignored."),
3831         OPT_UINTEGER(0, "max-stack", &trace.max_stack,
3832                      "Set the maximum stack depth when parsing the callchain, "
3833                      "anything beyond the specified depth will be ignored. "
3834                      "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
3835         OPT_BOOLEAN(0, "sort-events", &trace.sort_events,
3836                         "Sort batch of events before processing, use if getting out of order events"),
3837         OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
3838                         "print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
3839         OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3840                         "per thread proc mmap processing timeout in ms"),
3841         OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
3842                      trace__parse_cgroups),
3843         OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
3844                      "ms to wait before starting measurement after program "
3845                      "start"),
3846         OPT_END()
3847         };
3848         bool __maybe_unused max_stack_user_set = true;
3849         bool mmap_pages_user_set = true;
3850         struct perf_evsel *evsel;
3851         const char * const trace_subcommands[] = { "record", NULL };
3852         int err = -1;
3853         char bf[BUFSIZ];
3854
3855         signal(SIGSEGV, sighandler_dump_stack);
3856         signal(SIGFPE, sighandler_dump_stack);
3857
3858         trace.evlist = perf_evlist__new();
3859         trace.sctbl = syscalltbl__new();
3860
3861         if (trace.evlist == NULL || trace.sctbl == NULL) {
3862                 pr_err("Not enough memory to run!\n");
3863                 err = -ENOMEM;
3864                 goto out;
3865         }
3866
3867         err = perf_config(trace__config, &trace);
3868         if (err)
3869                 goto out;
3870
3871         argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3872                                  trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3873
3874         if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
3875                 usage_with_options_msg(trace_usage, trace_options,
3876                                        "cgroup monitoring only available in system-wide mode");
3877         }
3878
3879         evsel = bpf__setup_output_event(trace.evlist, "__augmented_syscalls__");
3880         if (IS_ERR(evsel)) {
3881                 bpf__strerror_setup_output_event(trace.evlist, PTR_ERR(evsel), bf, sizeof(bf));
3882                 pr_err("ERROR: Setup trace syscalls enter failed: %s\n", bf);
3883                 goto out;
3884         }
3885
3886         if (evsel) {
3887                 trace.syscalls.events.augmented = evsel;
3888                 trace__set_bpf_map_filtered_pids(&trace);
3889                 trace__set_bpf_map_syscalls(&trace);
3890         }
3891
3892         err = bpf__setup_stdout(trace.evlist);
3893         if (err) {
3894                 bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
3895                 pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
3896                 goto out;
3897         }
3898
3899         err = -1;
3900
3901         if (map_dump_str) {
3902                 trace.dump.map = bpf__find_map_by_name(map_dump_str);
3903                 if (trace.dump.map == NULL) {
3904                         pr_err("ERROR: BPF map \"%s\" not found\n", map_dump_str);
3905                         goto out;
3906                 }
3907         }
3908
3909         if (trace.trace_pgfaults) {
3910                 trace.opts.sample_address = true;
3911                 trace.opts.sample_time = true;
3912         }
3913
3914         if (trace.opts.mmap_pages == UINT_MAX)
3915                 mmap_pages_user_set = false;
3916
3917         if (trace.max_stack == UINT_MAX) {
3918                 trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack();
3919                 max_stack_user_set = false;
3920         }
3921
3922 #ifdef HAVE_DWARF_UNWIND_SUPPORT
3923         if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
3924                 record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
3925         }
3926 #endif
3927
3928         if (callchain_param.enabled) {
3929                 if (!mmap_pages_user_set && geteuid() == 0)
3930                         trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
3931
3932                 symbol_conf.use_callchain = true;
3933         }
3934
3935         if (trace.evlist->nr_entries > 0) {
3936                 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3937                 if (evlist__set_syscall_tp_fields(trace.evlist)) {
3938                         perror("failed to set syscalls:* tracepoint fields");
3939                         goto out;
3940                 }
3941         }
3942
3943         if (trace.sort_events) {
3944                 ordered_events__init(&trace.oe.data, ordered_events__deliver_event, &trace);
3945                 ordered_events__set_copy_on_queue(&trace.oe.data, true);
3946         }
3947
3948         /*
3949          * If we are augmenting syscalls, then combine what we put in the
3950          * __augmented_syscalls__ BPF map with what is in the
3951          * syscalls:sys_exit_FOO tracepoints, i.e. just like we do without BPF,
3952          * combining raw_syscalls:sys_enter with raw_syscalls:sys_exit.
3953          *
3954          * We'll switch to look at two BPF maps, one for sys_enter and the
3955          * other for sys_exit when we start augmenting the sys_exit paths with
3956          * buffers that are being copied from kernel to userspace, think 'read'
3957          * syscall.
3958          */
3959         if (trace.syscalls.events.augmented) {
3960                 evlist__for_each_entry(trace.evlist, evsel) {
3961                         bool raw_syscalls_sys_exit = strcmp(perf_evsel__name(evsel), "raw_syscalls:sys_exit") == 0;
3962
3963                         if (raw_syscalls_sys_exit) {
3964                                 trace.raw_augmented_syscalls = true;
3965                                 goto init_augmented_syscall_tp;
3966                         }
3967
3968                         if (trace.syscalls.events.augmented->priv == NULL &&
3969                             strstr(perf_evsel__name(evsel), "syscalls:sys_enter")) {
3970                                 struct perf_evsel *augmented = trace.syscalls.events.augmented;
3971                                 if (perf_evsel__init_augmented_syscall_tp(augmented, evsel) ||
3972                                     perf_evsel__init_augmented_syscall_tp_args(augmented))
3973                                         goto out;
3974                                 augmented->handler = trace__sys_enter;
3975                         }
3976
3977                         if (strstarts(perf_evsel__name(evsel), "syscalls:sys_exit_")) {
3978                                 struct syscall_tp *sc;
3979 init_augmented_syscall_tp:
3980                                 if (perf_evsel__init_augmented_syscall_tp(evsel, evsel))
3981                                         goto out;
3982                                 sc = evsel->priv;
3983                                 /*
3984                                  * For now with BPF raw_augmented we hook into
3985                                  * raw_syscalls:sys_enter and there we get all
3986                                  * 6 syscall args plus the tracepoint common
3987                                  * fields and the syscall_nr (another long).
3988                                  * So we check if that is the case and if so
3989                                  * don't look after the sc->args_size but
3990                                  * always after the full raw_syscalls:sys_enter
3991                                  * payload, which is fixed.
3992                                  *
3993                                  * We'll revisit this later to pass
3994                                  * s->args_size to the BPF augmenter (now
3995                                  * tools/perf/examples/bpf/augmented_raw_syscalls.c),
3996                                  * so that it copies only what we need for each
3997                                  * syscall, like what happens when we use
3998                                  * syscalls:sys_enter_NAME, so that we reduce
3999                                  * the kernel/userspace traffic to just what is
4000                                  * needed for each syscall.
4001                                  */
4002                                 if (trace.raw_augmented_syscalls)
4003                                         trace.raw_augmented_syscalls_args_size = (6 + 1) * sizeof(long) + sc->id.offset;
4004                                 perf_evsel__init_augmented_syscall_tp_ret(evsel);
4005                                 evsel->handler = trace__sys_exit;
4006                         }
4007                 }
4008         }
4009
4010         if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
4011                 return trace__record(&trace, argc-1, &argv[1]);
4012
4013         /* summary_only implies summary option, but don't overwrite summary if set */
4014         if (trace.summary_only)
4015                 trace.summary = trace.summary_only;
4016
4017         if (!trace.trace_syscalls && !trace.trace_pgfaults &&
4018             trace.evlist->nr_entries == 0 /* Was --events used? */) {
4019                 trace.trace_syscalls = true;
4020         }
4021
4022         if (output_name != NULL) {
4023                 err = trace__open_output(&trace, output_name);
4024                 if (err < 0) {
4025                         perror("failed to create output file");
4026                         goto out;
4027                 }
4028         }
4029
4030         err = target__validate(&trace.opts.target);
4031         if (err) {
4032                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
4033                 fprintf(trace.output, "%s", bf);
4034                 goto out_close;
4035         }
4036
4037         err = target__parse_uid(&trace.opts.target);
4038         if (err) {
4039                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
4040                 fprintf(trace.output, "%s", bf);
4041                 goto out_close;
4042         }
4043
4044         if (!argc && target__none(&trace.opts.target))
4045                 trace.opts.target.system_wide = true;
4046
4047         if (input_name)
4048                 err = trace__replay(&trace);
4049         else
4050                 err = trace__run(&trace, argc, argv);
4051
4052 out_close:
4053         if (output_name != NULL)
4054                 fclose(trace.output);
4055 out:
4056         return err;
4057 }