1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/evlist.h"
25 #include <subcmd/exec-cmd.h>
26 #include "util/machine.h"
27 #include "util/session.h"
28 #include "util/thread.h"
29 #include <subcmd/parse-options.h>
30 #include "util/strlist.h"
31 #include "util/intlist.h"
32 #include "util/thread_map.h"
33 #include "util/stat.h"
34 #include "trace-event.h"
35 #include "util/parse-events.h"
36 #include "util/bpf-loader.h"
37 #include "callchain.h"
38 #include "syscalltbl.h"
39 #include "rb_resort.h"
40
41 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
42 #include <stdlib.h>
43 #include <linux/err.h>
44 #include <linux/filter.h>
45 #include <linux/audit.h>
46 #include <linux/random.h>
47 #include <linux/stringify.h>
48 #include <linux/time64.h>
49
50 #ifndef O_CLOEXEC
51 # define O_CLOEXEC              02000000
52 #endif
53
54 struct trace {
55         struct perf_tool        tool;
56         struct syscalltbl       *sctbl;
57         struct {
58                 int             max;
59                 struct syscall  *table;
60                 struct {
61                         struct perf_evsel *sys_enter,
62                                           *sys_exit;
63                 }               events;
64         } syscalls;
65         struct record_opts      opts;
66         struct perf_evlist      *evlist;
67         struct machine          *host;
68         struct thread           *current;
69         u64                     base_time;
70         FILE                    *output;
71         unsigned long           nr_events;
72         struct strlist          *ev_qualifier;
73         struct {
74                 size_t          nr;
75                 int             *entries;
76         }                       ev_qualifier_ids;
77         struct intlist          *tid_list;
78         struct intlist          *pid_list;
79         struct {
80                 size_t          nr;
81                 pid_t           *entries;
82         }                       filter_pids;
83         double                  duration_filter;
84         double                  runtime_ms;
85         struct {
86                 u64             vfs_getname,
87                                 proc_getname;
88         } stats;
89         unsigned int            max_stack;
90         unsigned int            min_stack;
91         bool                    not_ev_qualifier;
92         bool                    live;
93         bool                    full_time;
94         bool                    sched;
95         bool                    multiple_threads;
96         bool                    summary;
97         bool                    summary_only;
98         bool                    show_comm;
99         bool                    show_tool_stats;
100         bool                    trace_syscalls;
101         bool                    kernel_syscallchains;
102         bool                    force;
103         bool                    vfs_getname;
104         int                     trace_pgfaults;
105         int                     open_id;
106 };
107
108 struct tp_field {
109         int offset;
110         union {
111                 u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
112                 void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
113         };
114 };
115
116 #define TP_UINT_FIELD(bits) \
117 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
118 { \
119         u##bits value; \
120         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
121         return value;  \
122 }
123
124 TP_UINT_FIELD(8);
125 TP_UINT_FIELD(16);
126 TP_UINT_FIELD(32);
127 TP_UINT_FIELD(64);
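/*
 * For reference, TP_UINT_FIELD(32) above expands to roughly the following
 * (a sketch of the generated reader, not literal preprocessor output):
 *
 *   static u64 tp_field__u32(struct tp_field *field, struct perf_sample *sample)
 *   {
 *           u32 value;
 *           memcpy(&value, sample->raw_data + field->offset, sizeof(value));
 *           return value;
 *   }
 *
 * i.e. each width gets a helper that copies the field out of the raw
 * tracepoint payload (memcpy keeps unaligned reads safe) and widens it to u64.
 */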
128
129 #define TP_UINT_FIELD__SWAPPED(bits) \
130 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
131 { \
132         u##bits value; \
133         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
134         return bswap_##bits(value);\
135 }
136
137 TP_UINT_FIELD__SWAPPED(16);
138 TP_UINT_FIELD__SWAPPED(32);
139 TP_UINT_FIELD__SWAPPED(64);
140
141 static int tp_field__init_uint(struct tp_field *field,
142                                struct format_field *format_field,
143                                bool needs_swap)
144 {
145         field->offset = format_field->offset;
146
147         switch (format_field->size) {
148         case 1:
149                 field->integer = tp_field__u8;
150                 break;
151         case 2:
152                 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
153                 break;
154         case 4:
155                 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
156                 break;
157         case 8:
158                 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
159                 break;
160         default:
161                 return -1;
162         }
163
164         return 0;
165 }
166
167 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
168 {
169         return sample->raw_data + field->offset;
170 }
171
172 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
173 {
174         field->offset = format_field->offset;
175         field->pointer = tp_field__ptr;
176         return 0;
177 }
178
179 struct syscall_tp {
180         struct tp_field id;
181         union {
182                 struct tp_field args, ret;
183         };
184 };
185
186 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
187                                           struct tp_field *field,
188                                           const char *name)
189 {
190         struct format_field *format_field = perf_evsel__field(evsel, name);
191
192         if (format_field == NULL)
193                 return -1;
194
195         return tp_field__init_uint(field, format_field, evsel->needs_swap);
196 }
197
198 #define perf_evsel__init_sc_tp_uint_field(evsel, name) \
199         ({ struct syscall_tp *sc = evsel->priv;\
200            perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
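/*
 * e.g. perf_evsel__init_sc_tp_uint_field(evsel, id) above expands to roughly
 * (a sketch, not literal preprocessor output):
 *
 *   ({ struct syscall_tp *sc = evsel->priv;
 *      perf_evsel__init_tp_uint_field(evsel, &sc->id, "id"); })
 *
 * i.e. the same token names both the struct syscall_tp member and the
 * tracepoint format field looked up via perf_evsel__field().
 */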
201
202 static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
203                                          struct tp_field *field,
204                                          const char *name)
205 {
206         struct format_field *format_field = perf_evsel__field(evsel, name);
207
208         if (format_field == NULL)
209                 return -1;
210
211         return tp_field__init_ptr(field, format_field);
212 }
213
214 #define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
215         ({ struct syscall_tp *sc = evsel->priv;\
216            perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
217
218 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
219 {
220         zfree(&evsel->priv);
221         perf_evsel__delete(evsel);
222 }
223
224 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
225 {
226         evsel->priv = malloc(sizeof(struct syscall_tp));
227         if (evsel->priv != NULL) {
228                 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
229                         goto out_delete;
230
231                 evsel->handler = handler;
232                 return 0;
233         }
234
235         return -ENOMEM;
236
237 out_delete:
238         zfree(&evsel->priv);
239         return -ENOENT;
240 }
241
242 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
243 {
244         struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
245
246         /* older kernels (e.g., RHEL6) use syscalls:{enter,exit} */
247         if (IS_ERR(evsel))
248                 evsel = perf_evsel__newtp("syscalls", direction);
249
250         if (IS_ERR(evsel))
251                 return NULL;
252
253         if (perf_evsel__init_syscall_tp(evsel, handler))
254                 goto out_delete;
255
256         return evsel;
257
258 out_delete:
259         perf_evsel__delete_priv(evsel);
260         return NULL;
261 }
262
263 #define perf_evsel__sc_tp_uint(evsel, name, sample) \
264         ({ struct syscall_tp *fields = evsel->priv; \
265            fields->name.integer(&fields->name, sample); })
266
267 #define perf_evsel__sc_tp_ptr(evsel, name, sample) \
268         ({ struct syscall_tp *fields = evsel->priv; \
269            fields->name.pointer(&fields->name, sample); })
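/*
 * Correspondingly, perf_evsel__sc_tp_uint(evsel, id, sample) above expands to
 * roughly (sketch, not literal preprocessor output):
 *
 *   ({ struct syscall_tp *fields = evsel->priv;
 *      fields->id.integer(&fields->id, sample); })
 *
 * i.e. it invokes whichever width/byte-order reader tp_field__init_uint()
 * picked for the 'id' field when the tracepoint was set up.
 */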
270
271 struct syscall_arg {
272         unsigned long val;
273         struct thread *thread;
274         struct trace  *trace;
275         void          *parm;
276         u8            idx;
277         u8            mask;
278 };
279
280 struct strarray {
281         int         offset;
282         int         nr_entries;
283         const char **entries;
284 };
285
286 #define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
287         .nr_entries = ARRAY_SIZE(array), \
288         .entries = array, \
289 }
290
291 #define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
292         .offset     = off, \
293         .nr_entries = ARRAY_SIZE(array), \
294         .entries = array, \
295 }
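/*
 * As an illustration, DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1), used further
 * down, amounts to (sketch):
 *
 *   struct strarray strarray__epoll_ctl_ops = {
 *           .offset     = 1,
 *           .nr_entries = ARRAY_SIZE(epoll_ctl_ops),
 *           .entries    = epoll_ctl_ops,
 *   };
 *
 * so a value of 1 (EPOLL_CTL_ADD) is printed as entries[1 - offset] == "ADD"
 * by __syscall_arg__scnprintf_strarray() below.
 */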
296
297 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
298                                                 const char *intfmt,
299                                                 struct syscall_arg *arg)
300 {
301         struct strarray *sa = arg->parm;
302         int idx = arg->val - sa->offset;
303
304         if (idx < 0 || idx >= sa->nr_entries)
305                 return scnprintf(bf, size, intfmt, arg->val);
306
307         return scnprintf(bf, size, "%s", sa->entries[idx]);
308 }
309
310 static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
311                                               struct syscall_arg *arg)
312 {
313         return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
314 }
315
316 #define SCA_STRARRAY syscall_arg__scnprintf_strarray
317
318 #if defined(__i386__) || defined(__x86_64__)
319 /*
320  * FIXME: Make this available to all arches as soon as the ioctl beautifier
321  *        gets rewritten to support all arches.
322  */
323 static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
324                                                  struct syscall_arg *arg)
325 {
326         return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
327 }
328
329 #define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
330 #endif /* defined(__i386__) || defined(__x86_64__) */
331
332 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
333                                         struct syscall_arg *arg);
334
335 #define SCA_FD syscall_arg__scnprintf_fd
336
337 #ifndef AT_FDCWD
338 #define AT_FDCWD        -100
339 #endif
340
341 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
342                                            struct syscall_arg *arg)
343 {
344         int fd = arg->val;
345
346         if (fd == AT_FDCWD)
347                 return scnprintf(bf, size, "CWD");
348
349         return syscall_arg__scnprintf_fd(bf, size, arg);
350 }
351
352 #define SCA_FDAT syscall_arg__scnprintf_fd_at
353
354 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
355                                               struct syscall_arg *arg);
356
357 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
358
359 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
360                                          struct syscall_arg *arg)
361 {
362         return scnprintf(bf, size, "%#lx", arg->val);
363 }
364
365 #define SCA_HEX syscall_arg__scnprintf_hex
366
367 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
368                                          struct syscall_arg *arg)
369 {
370         return scnprintf(bf, size, "%d", arg->val);
371 }
372
373 #define SCA_INT syscall_arg__scnprintf_int
374
375 static const char *bpf_cmd[] = {
376         "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
377         "MAP_GET_NEXT_KEY", "PROG_LOAD",
378 };
379 static DEFINE_STRARRAY(bpf_cmd);
380
381 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
382 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
383
384 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
385 static DEFINE_STRARRAY(itimers);
386
387 static const char *keyctl_options[] = {
388         "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
389         "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
390         "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
391         "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
392         "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
393 };
394 static DEFINE_STRARRAY(keyctl_options);
395
396 static const char *whences[] = { "SET", "CUR", "END",
397 #ifdef SEEK_DATA
398 "DATA",
399 #endif
400 #ifdef SEEK_HOLE
401 "HOLE",
402 #endif
403 };
404 static DEFINE_STRARRAY(whences);
405
406 static const char *fcntl_cmds[] = {
407         "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
408         "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
409         "F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
410         "F_GETOWNER_UIDS",
411 };
412 static DEFINE_STRARRAY(fcntl_cmds);
413
414 static const char *rlimit_resources[] = {
415         "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
416         "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
417         "RTTIME",
418 };
419 static DEFINE_STRARRAY(rlimit_resources);
420
421 static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
422 static DEFINE_STRARRAY(sighow);
423
424 static const char *clockid[] = {
425         "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
426         "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
427         "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
428 };
429 static DEFINE_STRARRAY(clockid);
430
431 static const char *socket_families[] = {
432         "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
433         "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
434         "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
435         "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
436         "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
437         "ALG", "NFC", "VSOCK",
438 };
439 static DEFINE_STRARRAY(socket_families);
440
441 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
442                                                  struct syscall_arg *arg)
443 {
444         size_t printed = 0;
445         int mode = arg->val;
446
447         if (mode == F_OK) /* 0 */
448                 return scnprintf(bf, size, "F");
449 #define P_MODE(n) \
450         if (mode & n##_OK) { \
451                 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
452                 mode &= ~n##_OK; \
453         }
454
455         P_MODE(R);
456         P_MODE(W);
457         P_MODE(X);
458 #undef P_MODE
459
460         if (mode)
461                 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
462
463         return printed;
464 }
465
466 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
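/*
 * e.g. an access() mode of R_OK|W_OK is shown as "RW", X_OK alone as "X",
 * and a plain F_OK (0) existence check as "F".
 */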
467
468 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
469                                               struct syscall_arg *arg);
470
471 #define SCA_FILENAME syscall_arg__scnprintf_filename
472
473 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
474                                                 struct syscall_arg *arg)
475 {
476         int printed = 0, flags = arg->val;
477
478 #define P_FLAG(n) \
479         if (flags & O_##n) { \
480                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
481                 flags &= ~O_##n; \
482         }
483
484         P_FLAG(CLOEXEC);
485         P_FLAG(NONBLOCK);
486 #undef P_FLAG
487
488         if (flags)
489                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
490
491         return printed;
492 }
493
494 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
495
496 #if defined(__i386__) || defined(__x86_64__)
497 /*
498  * FIXME: Make this available to all arches.
499  */
500 #define TCGETS          0x5401
501
502 static const char *tioctls[] = {
503         "TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
504         "TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
505         "TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
506         "TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
507         "TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
508         "TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
509         "TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
510         "TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
511         "TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
512         "TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
513         "TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
514         [0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
515         "TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
516         "TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
517         "TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
518 };
519
520 static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
521 #endif /* defined(__i386__) || defined(__x86_64__) */
522
523 #ifndef GRND_NONBLOCK
524 #define GRND_NONBLOCK   0x0001
525 #endif
526 #ifndef GRND_RANDOM
527 #define GRND_RANDOM     0x0002
528 #endif
529
530 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
531                                                    struct syscall_arg *arg)
532 {
533         int printed = 0, flags = arg->val;
534
535 #define P_FLAG(n) \
536         if (flags & GRND_##n) { \
537                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
538                 flags &= ~GRND_##n; \
539         }
540
541         P_FLAG(RANDOM);
542         P_FLAG(NONBLOCK);
543 #undef P_FLAG
544
545         if (flags)
546                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
547
548         return printed;
549 }
550
551 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
552
553 #define STRARRAY(arg, name, array) \
554           .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
555           .arg_parm      = { [arg] = &strarray__##array, }
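/*
 * e.g. STRARRAY(0, cmd, bpf_cmd), as used in the "bpf" entry of syscall_fmts[]
 * below, expands to roughly (sketch):
 *
 *   .arg_scnprintf = { [0] = SCA_STRARRAY, },
 *   .arg_parm      = { [0] = &strarray__bpf_cmd, },
 *
 * wiring argument 0 to the generic strarray beautifier with bpf_cmd as its
 * lookup table (the middle 'name' parameter is documentation only).
 */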
556
557 #include "trace/beauty/eventfd.c"
558 #include "trace/beauty/flock.c"
559 #include "trace/beauty/futex_op.c"
560 #include "trace/beauty/mmap.c"
561 #include "trace/beauty/mode_t.c"
562 #include "trace/beauty/msg_flags.c"
563 #include "trace/beauty/open_flags.c"
564 #include "trace/beauty/perf_event_open.c"
565 #include "trace/beauty/pid.c"
566 #include "trace/beauty/sched_policy.c"
567 #include "trace/beauty/seccomp.c"
568 #include "trace/beauty/signum.c"
569 #include "trace/beauty/socket_type.c"
570 #include "trace/beauty/waitid_options.c"
571
572 static struct syscall_fmt {
573         const char *name;
574         const char *alias;
575         size_t     (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
576         void       *arg_parm[6];
577         bool       errmsg;
578         bool       errpid;
579         bool       timeout;
580         bool       hexret;
581 } syscall_fmts[] = {
582         { .name     = "access",     .errmsg = true,
583           .arg_scnprintf = { [1] = SCA_ACCMODE,  /* mode */ }, },
584         { .name     = "arch_prctl", .errmsg = true, .alias = "prctl", },
585         { .name     = "bpf",        .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
586         { .name     = "brk",        .hexret = true,
587           .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
588         { .name     = "chdir",      .errmsg = true, },
589         { .name     = "chmod",      .errmsg = true, },
590         { .name     = "chroot",     .errmsg = true, },
591         { .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
592         { .name     = "clone",      .errpid = true, },
593         { .name     = "close",      .errmsg = true,
594           .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
595         { .name     = "connect",    .errmsg = true, },
596         { .name     = "creat",      .errmsg = true, },
597         { .name     = "dup",        .errmsg = true, },
598         { .name     = "dup2",       .errmsg = true, },
599         { .name     = "dup3",       .errmsg = true, },
600         { .name     = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
601         { .name     = "eventfd2",   .errmsg = true,
602           .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
603         { .name     = "faccessat",  .errmsg = true, },
604         { .name     = "fadvise64",  .errmsg = true, },
605         { .name     = "fallocate",  .errmsg = true, },
606         { .name     = "fchdir",     .errmsg = true, },
607         { .name     = "fchmod",     .errmsg = true, },
608         { .name     = "fchmodat",   .errmsg = true,
609           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
610         { .name     = "fchown",     .errmsg = true, },
611         { .name     = "fchownat",   .errmsg = true,
612           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
613         { .name     = "fcntl",      .errmsg = true,
614           .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
615           .arg_parm      = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
616         { .name     = "fdatasync",  .errmsg = true, },
617         { .name     = "flock",      .errmsg = true,
618           .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
619         { .name     = "fsetxattr",  .errmsg = true, },
620         { .name     = "fstat",      .errmsg = true, .alias = "newfstat", },
621         { .name     = "fstatat",    .errmsg = true, .alias = "newfstatat", },
622         { .name     = "fstatfs",    .errmsg = true, },
623         { .name     = "fsync",    .errmsg = true, },
624         { .name     = "ftruncate", .errmsg = true, },
625         { .name     = "futex",      .errmsg = true,
626           .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
627         { .name     = "futimesat", .errmsg = true,
628           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
629         { .name     = "getdents",   .errmsg = true, },
630         { .name     = "getdents64", .errmsg = true, },
631         { .name     = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
632         { .name     = "getpid",     .errpid = true, },
633         { .name     = "getpgid",    .errpid = true, },
634         { .name     = "getppid",    .errpid = true, },
635         { .name     = "getrandom",  .errmsg = true,
636           .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
637         { .name     = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
638         { .name     = "getxattr",   .errmsg = true, },
639         { .name     = "inotify_add_watch",          .errmsg = true, },
640         { .name     = "ioctl",      .errmsg = true,
641           .arg_scnprintf = {
642 #if defined(__i386__) || defined(__x86_64__)
643 /*
644  * FIXME: Make this available to all arches.
645  */
646                              [1] = SCA_STRHEXARRAY, /* cmd */
647                              [2] = SCA_HEX, /* arg */ },
648           .arg_parm      = { [1] = &strarray__tioctls, /* cmd */ }, },
649 #else
650                              [2] = SCA_HEX, /* arg */ }, },
651 #endif
652         { .name     = "keyctl",     .errmsg = true, STRARRAY(0, option, keyctl_options), },
653         { .name     = "kill",       .errmsg = true,
654           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
655         { .name     = "lchown",    .errmsg = true, },
656         { .name     = "lgetxattr",  .errmsg = true, },
657         { .name     = "linkat",     .errmsg = true,
658           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
659         { .name     = "listxattr",  .errmsg = true, },
660         { .name     = "llistxattr", .errmsg = true, },
661         { .name     = "lremovexattr",  .errmsg = true, },
662         { .name     = "lseek",      .errmsg = true,
663           .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
664           .arg_parm      = { [2] = &strarray__whences, /* whence */ }, },
665         { .name     = "lsetxattr",  .errmsg = true, },
666         { .name     = "lstat",      .errmsg = true, .alias = "newlstat", },
667         { .name     = "lsxattr",    .errmsg = true, },
668         { .name     = "madvise",    .errmsg = true,
669           .arg_scnprintf = { [0] = SCA_HEX,      /* start */
670                              [2] = SCA_MADV_BHV, /* behavior */ }, },
671         { .name     = "mkdir",    .errmsg = true, },
672         { .name     = "mkdirat",    .errmsg = true,
673           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
674         { .name     = "mknod",      .errmsg = true, },
675         { .name     = "mknodat",    .errmsg = true,
676           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
677         { .name     = "mlock",      .errmsg = true,
678           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
679         { .name     = "mlockall",   .errmsg = true,
680           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
681         { .name     = "mmap",       .hexret = true,
682 /* The standard mmap maps to old_mmap on s390x */
683 #if defined(__s390x__)
684         .alias = "old_mmap",
685 #endif
686           .arg_scnprintf = { [0] = SCA_HEX,       /* addr */
687                              [2] = SCA_MMAP_PROT, /* prot */
688                              [3] = SCA_MMAP_FLAGS, /* flags */ }, },
689         { .name     = "mprotect",   .errmsg = true,
690           .arg_scnprintf = { [0] = SCA_HEX, /* start */
691                              [2] = SCA_MMAP_PROT, /* prot */ }, },
692         { .name     = "mq_unlink", .errmsg = true,
693           .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
694         { .name     = "mremap",     .hexret = true,
695           .arg_scnprintf = { [0] = SCA_HEX, /* addr */
696                              [3] = SCA_MREMAP_FLAGS, /* flags */
697                              [4] = SCA_HEX, /* new_addr */ }, },
698         { .name     = "munlock",    .errmsg = true,
699           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
700         { .name     = "munmap",     .errmsg = true,
701           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
702         { .name     = "name_to_handle_at", .errmsg = true,
703           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
704         { .name     = "newfstatat", .errmsg = true,
705           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
706         { .name     = "open",       .errmsg = true,
707           .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
708         { .name     = "open_by_handle_at", .errmsg = true,
709           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
710                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
711         { .name     = "openat",     .errmsg = true,
712           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
713                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
714         { .name     = "perf_event_open", .errmsg = true,
715           .arg_scnprintf = { [2] = SCA_INT, /* cpu */
716                              [3] = SCA_FD,  /* group_fd */
717                              [4] = SCA_PERF_FLAGS,  /* flags */ }, },
718         { .name     = "pipe2",      .errmsg = true,
719           .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
720         { .name     = "poll",       .errmsg = true, .timeout = true, },
721         { .name     = "ppoll",      .errmsg = true, .timeout = true, },
722         { .name     = "pread",      .errmsg = true, .alias = "pread64", },
723         { .name     = "preadv",     .errmsg = true, .alias = "pread", },
724         { .name     = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
725         { .name     = "pwrite",     .errmsg = true, .alias = "pwrite64", },
726         { .name     = "pwritev",    .errmsg = true, },
727         { .name     = "read",       .errmsg = true, },
728         { .name     = "readlink",   .errmsg = true, },
729         { .name     = "readlinkat", .errmsg = true,
730           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
731         { .name     = "readv",      .errmsg = true, },
732         { .name     = "recvfrom",   .errmsg = true,
733           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
734         { .name     = "recvmmsg",   .errmsg = true,
735           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
736         { .name     = "recvmsg",    .errmsg = true,
737           .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
738         { .name     = "removexattr", .errmsg = true, },
739         { .name     = "renameat",   .errmsg = true,
740           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
741         { .name     = "rmdir",    .errmsg = true, },
742         { .name     = "rt_sigaction", .errmsg = true,
743           .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
744         { .name     = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
745         { .name     = "rt_sigqueueinfo", .errmsg = true,
746           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
747         { .name     = "rt_tgsigqueueinfo", .errmsg = true,
748           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
749         { .name     = "sched_getattr",        .errmsg = true, },
750         { .name     = "sched_setattr",        .errmsg = true, },
751         { .name     = "sched_setscheduler",   .errmsg = true,
752           .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
753         { .name     = "seccomp", .errmsg = true,
754           .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
755                              [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
756         { .name     = "select",     .errmsg = true, .timeout = true, },
757         { .name     = "sendmmsg",    .errmsg = true,
758           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
759         { .name     = "sendmsg",    .errmsg = true,
760           .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
761         { .name     = "sendto",     .errmsg = true,
762           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
763         { .name     = "set_tid_address", .errpid = true, },
764         { .name     = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
765         { .name     = "setpgid",    .errmsg = true, },
766         { .name     = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
767         { .name     = "setxattr",   .errmsg = true, },
768         { .name     = "shutdown",   .errmsg = true, },
769         { .name     = "socket",     .errmsg = true,
770           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
771                              [1] = SCA_SK_TYPE, /* type */ },
772           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
773         { .name     = "socketpair", .errmsg = true,
774           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
775                              [1] = SCA_SK_TYPE, /* type */ },
776           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
777         { .name     = "stat",       .errmsg = true, .alias = "newstat", },
778         { .name     = "statfs",     .errmsg = true, },
779         { .name     = "swapoff",    .errmsg = true,
780           .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
781         { .name     = "swapon",     .errmsg = true,
782           .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
783         { .name     = "symlinkat",  .errmsg = true,
784           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
785         { .name     = "tgkill",     .errmsg = true,
786           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
787         { .name     = "tkill",      .errmsg = true,
788           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
789         { .name     = "truncate",   .errmsg = true, },
790         { .name     = "uname",      .errmsg = true, .alias = "newuname", },
791         { .name     = "unlinkat",   .errmsg = true,
792           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
793         { .name     = "utime",  .errmsg = true, },
794         { .name     = "utimensat",  .errmsg = true,
795           .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
796         { .name     = "utimes",  .errmsg = true, },
797         { .name     = "vmsplice",  .errmsg = true, },
798         { .name     = "wait4",      .errpid = true,
799           .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
800         { .name     = "waitid",     .errpid = true,
801           .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
802         { .name     = "write",      .errmsg = true, },
803         { .name     = "writev",     .errmsg = true, },
804 };
805
806 static int syscall_fmt__cmp(const void *name, const void *fmtp)
807 {
808         const struct syscall_fmt *fmt = fmtp;
809         return strcmp(name, fmt->name);
810 }
811
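/*
 * Note: the bsearch() below assumes syscall_fmts[] is kept sorted by ->name
 * (strcmp() order), so new entries must be added in the right place.
 */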
812 static struct syscall_fmt *syscall_fmt__find(const char *name)
813 {
814         const int nmemb = ARRAY_SIZE(syscall_fmts);
815         return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
816 }
817
818 struct syscall {
819         struct event_format *tp_format;
820         int                 nr_args;
821         struct format_field *args;
822         const char          *name;
823         bool                is_exit;
824         struct syscall_fmt  *fmt;
825         size_t              (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
826         void                **arg_parm;
827 };
828
829 /*
830  * We need to have this 'calculated' boolean because in some cases we really
831  * don't know what the duration of a syscall is, for instance, when we start
832  * a session and some threads are waiting for a syscall to finish, say 'poll',
833  * in which case all we can do is to print "( ? )" for the duration and for the
834  * start timestamp.
835  */
836 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
837 {
838         double duration = (double)t / NSEC_PER_MSEC;
839         size_t printed = fprintf(fp, "(");
840
841         if (!calculated)
842                 printed += fprintf(fp, "     ?   ");
843         else if (duration >= 1.0)
844                 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
845         else if (duration >= 0.01)
846                 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
847         else
848                 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
849         return printed + fprintf(fp, "): ");
850 }
851
852 /**
853  * filename.ptr: The filename char pointer that will be vfs_getname'd
854  * filename.entry_str_pos: Where to insert the string translated from
855  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
856  */
857 struct thread_trace {
858         u64               entry_time;
859         u64               exit_time;
860         bool              entry_pending;
861         unsigned long     nr_events;
862         unsigned long     pfmaj, pfmin;
863         char              *entry_str;
864         double            runtime_ms;
865         struct {
866                 unsigned long ptr;
867                 short int     entry_str_pos;
868                 bool          pending_open;
869                 unsigned int  namelen;
870                 char          *name;
871         } filename;
872         struct {
873                 int       max;
874                 char      **table;
875         } paths;
876
877         struct intlist *syscall_stats;
878 };
879
880 static struct thread_trace *thread_trace__new(void)
881 {
882         struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
883
884         if (ttrace) {
885                 ttrace->paths.max = -1;
886                 ttrace->syscall_stats = intlist__new(NULL);
887         }
888
889         return ttrace;
890 }
891
892 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
893 {
894         struct thread_trace *ttrace;
895
896         if (thread == NULL)
897                 goto fail;
898
899         if (thread__priv(thread) == NULL)
900                 thread__set_priv(thread, thread_trace__new());
901
902         if (thread__priv(thread) == NULL)
903                 goto fail;
904
905         ttrace = thread__priv(thread);
906         ++ttrace->nr_events;
907
908         return ttrace;
909 fail:
910         color_fprintf(fp, PERF_COLOR_RED,
911                       "WARNING: not enough memory, dropping samples!\n");
912         return NULL;
913 }
914
915 #define TRACE_PFMAJ             (1 << 0)
916 #define TRACE_PFMIN             (1 << 1)
917
918 static const size_t trace__entry_str_size = 2048;
919
920 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
921 {
922         struct thread_trace *ttrace = thread__priv(thread);
923
924         if (fd > ttrace->paths.max) {
925                 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
926
927                 if (npath == NULL)
928                         return -1;
929
930                 if (ttrace->paths.max != -1) {
931                         memset(npath + ttrace->paths.max + 1, 0,
932                                (fd - ttrace->paths.max) * sizeof(char *));
933                 } else {
934                         memset(npath, 0, (fd + 1) * sizeof(char *));
935                 }
936
937                 ttrace->paths.table = npath;
938                 ttrace->paths.max   = fd;
939         }
940
941         ttrace->paths.table[fd] = strdup(pathname);
942
943         return ttrace->paths.table[fd] != NULL ? 0 : -1;
944 }
945
946 static int thread__read_fd_path(struct thread *thread, int fd)
947 {
948         char linkname[PATH_MAX], pathname[PATH_MAX];
949         struct stat st;
950         int ret;
951
952         if (thread->pid_ == thread->tid) {
953                 scnprintf(linkname, sizeof(linkname),
954                           "/proc/%d/fd/%d", thread->pid_, fd);
955         } else {
956                 scnprintf(linkname, sizeof(linkname),
957                           "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
958         }
959
960         if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
961                 return -1;
962
963         ret = readlink(linkname, pathname, sizeof(pathname));
964
965         if (ret < 0 || ret > st.st_size)
966                 return -1;
967
968         pathname[ret] = '\0';
969         return trace__set_fd_pathname(thread, fd, pathname);
970 }
971
972 static const char *thread__fd_path(struct thread *thread, int fd,
973                                    struct trace *trace)
974 {
975         struct thread_trace *ttrace = thread__priv(thread);
976
977         if (ttrace == NULL)
978                 return NULL;
979
980         if (fd < 0)
981                 return NULL;
982
983         if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
984                 if (!trace->live)
985                         return NULL;
986                 ++trace->stats.proc_getname;
987                 if (thread__read_fd_path(thread, fd))
988                         return NULL;
989         }
990
991         return ttrace->paths.table[fd];
992 }
993
994 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
995                                         struct syscall_arg *arg)
996 {
997         int fd = arg->val;
998         size_t printed = scnprintf(bf, size, "%d", fd);
999         const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1000
1001         if (path)
1002                 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1003
1004         return printed;
1005 }
1006
1007 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1008                                               struct syscall_arg *arg)
1009 {
1010         int fd = arg->val;
1011         size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1012         struct thread_trace *ttrace = thread__priv(arg->thread);
1013
1014         if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1015                 zfree(&ttrace->paths.table[fd]);
1016
1017         return printed;
1018 }
1019
1020 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1021                                      unsigned long ptr)
1022 {
1023         struct thread_trace *ttrace = thread__priv(thread);
1024
1025         ttrace->filename.ptr = ptr;
1026         ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1027 }
1028
1029 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1030                                               struct syscall_arg *arg)
1031 {
1032         unsigned long ptr = arg->val;
1033
1034         if (!arg->trace->vfs_getname)
1035                 return scnprintf(bf, size, "%#lx", ptr);
1036
1037         thread__set_filename_pos(arg->thread, bf, ptr);
1038         return 0;
1039 }
1040
1041 static bool trace__filter_duration(struct trace *trace, double t)
1042 {
1043         return t < (trace->duration_filter * NSEC_PER_MSEC);
1044 }
1045
1046 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1047 {
1048         double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1049
1050         return fprintf(fp, "%10.3f ", ts);
1051 }
1052
1053 /*
1054  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1055  * using ttrace->entry_time for a thread that receives a sys_exit without
1056  * first having received a sys_enter ("poll" issued before tracing session
1057  * starts, or a sys_enter lost due to ring buffer overflow).
1058  */
1059 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1060 {
1061         if (tstamp > 0)
1062                 return __trace__fprintf_tstamp(trace, tstamp, fp);
1063
1064         return fprintf(fp, "         ? ");
1065 }
1066
1067 static bool done = false;
1068 static bool interrupted = false;
1069
1070 static void sig_handler(int sig)
1071 {
1072         done = true;
1073         interrupted = sig == SIGINT;
1074 }
1075
1076 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1077                                         u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1078 {
1079         size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1080         printed += fprintf_duration(duration, duration_calculated, fp);
1081
1082         if (trace->multiple_threads) {
1083                 if (trace->show_comm)
1084                         printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1085                 printed += fprintf(fp, "%d ", thread->tid);
1086         }
1087
1088         return printed;
1089 }
1090
1091 static int trace__process_event(struct trace *trace, struct machine *machine,
1092                                 union perf_event *event, struct perf_sample *sample)
1093 {
1094         int ret = 0;
1095
1096         switch (event->header.type) {
1097         case PERF_RECORD_LOST:
1098                 color_fprintf(trace->output, PERF_COLOR_RED,
1099                               "LOST %" PRIu64 " events!\n", event->lost.lost);
1100                 ret = machine__process_lost_event(machine, event, sample);
1101                 break;
1102         default:
1103                 ret = machine__process_event(machine, event, sample);
1104                 break;
1105         }
1106
1107         return ret;
1108 }
1109
1110 static int trace__tool_process(struct perf_tool *tool,
1111                                union perf_event *event,
1112                                struct perf_sample *sample,
1113                                struct machine *machine)
1114 {
1115         struct trace *trace = container_of(tool, struct trace, tool);
1116         return trace__process_event(trace, machine, event, sample);
1117 }
1118
1119 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1120 {
1121         struct machine *machine = vmachine;
1122
1123         if (machine->kptr_restrict_warned)
1124                 return NULL;
1125
1126         if (symbol_conf.kptr_restrict) {
1127                 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1128                            "Check /proc/sys/kernel/kptr_restrict.\n\n"
1129                            "Kernel samples will not be resolved.\n");
1130                 machine->kptr_restrict_warned = true;
1131                 return NULL;
1132         }
1133
1134         return machine__resolve_kernel_addr(vmachine, addrp, modp);
1135 }
1136
1137 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1138 {
1139         int err = symbol__init(NULL);
1140
1141         if (err)
1142                 return err;
1143
1144         trace->host = machine__new_host();
1145         if (trace->host == NULL)
1146                 return -ENOMEM;
1147
1148         if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
1149                 return -errno;
1150
1151         err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1152                                             evlist->threads, trace__tool_process, false,
1153                                             trace->opts.proc_map_timeout);
1154         if (err)
1155                 symbol__exit();
1156
1157         return err;
1158 }
1159
1160 static int syscall__set_arg_fmts(struct syscall *sc)
1161 {
1162         struct format_field *field;
1163         int idx = 0, len;
1164
1165         sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1166         if (sc->arg_scnprintf == NULL)
1167                 return -1;
1168
1169         if (sc->fmt)
1170                 sc->arg_parm = sc->fmt->arg_parm;
1171
1172         for (field = sc->args; field; field = field->next) {
1173                 if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1174                         sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1175                 else if (strcmp(field->type, "const char *") == 0 &&
1176                          (strcmp(field->name, "filename") == 0 ||
1177                           strcmp(field->name, "path") == 0 ||
1178                           strcmp(field->name, "pathname") == 0))
1179                         sc->arg_scnprintf[idx] = SCA_FILENAME;
1180                 else if (field->flags & FIELD_IS_POINTER)
1181                         sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1182                 else if (strcmp(field->type, "pid_t") == 0)
1183                         sc->arg_scnprintf[idx] = SCA_PID;
1184                 else if (strcmp(field->type, "umode_t") == 0)
1185                         sc->arg_scnprintf[idx] = SCA_MODE_T;
1186                 else if ((strcmp(field->type, "int") == 0 ||
1187                           strcmp(field->type, "unsigned int") == 0 ||
1188                           strcmp(field->type, "long") == 0) &&
1189                          (len = strlen(field->name)) >= 2 &&
1190                          strcmp(field->name + len - 2, "fd") == 0) {
1191                         /*
1192                          * /sys/kernel/tracing/events/syscalls/sys_enter*
1193                          * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1194                          * 65 int
1195                          * 23 unsigned int
1196                          * 7 unsigned long
1197                          */
1198                         sc->arg_scnprintf[idx] = SCA_FD;
1199                 }
1200                 ++idx;
1201         }
1202
1203         return 0;
1204 }
1205
1206 static int trace__read_syscall_info(struct trace *trace, int id)
1207 {
1208         char tp_name[128];
1209         struct syscall *sc;
1210         const char *name = syscalltbl__name(trace->sctbl, id);
1211
1212         if (name == NULL)
1213                 return -1;
1214
1215         if (id > trace->syscalls.max) {
1216                 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1217
1218                 if (nsyscalls == NULL)
1219                         return -1;
1220
1221                 if (trace->syscalls.max != -1) {
1222                         memset(nsyscalls + trace->syscalls.max + 1, 0,
1223                                (id - trace->syscalls.max) * sizeof(*sc));
1224                 } else {
1225                         memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1226                 }
1227
1228                 trace->syscalls.table = nsyscalls;
1229                 trace->syscalls.max   = id;
1230         }
1231
1232         sc = trace->syscalls.table + id;
1233         sc->name = name;
1234
1235         sc->fmt  = syscall_fmt__find(sc->name);
1236
1237         snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1238         sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1239
1240         if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1241                 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1242                 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1243         }
1244
1245         if (IS_ERR(sc->tp_format))
1246                 return -1;
1247
1248         sc->args = sc->tp_format->format.fields;
1249         sc->nr_args = sc->tp_format->format.nr_fields;
1250         /*
1251          * We need to check for and discard the first field, '__syscall_nr'
1252          * ('nr' on older kernels), which carries the syscall number and is
1253          * not needed here.
1254          */
1255         if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1256                 sc->args = sc->args->next;
1257                 --sc->nr_args;
1258         }
1259
1260         sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1261
1262         return syscall__set_arg_fmts(sc);
1263 }
1264
1265 static int trace__validate_ev_qualifier(struct trace *trace)
1266 {
1267         int err = 0, i;
1268         struct str_node *pos;
1269
1270         trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1271         trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1272                                                  sizeof(trace->ev_qualifier_ids.entries[0]));
1273
1274         if (trace->ev_qualifier_ids.entries == NULL) {
1275                 fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1276                        trace->output);
1277                 err = -EINVAL;
1278                 goto out;
1279         }
1280
1281         i = 0;
1282
1283         strlist__for_each_entry(pos, trace->ev_qualifier) {
1284                 const char *sc = pos->s;
1285                 int id = syscalltbl__id(trace->sctbl, sc);
1286
1287                 if (id < 0) {
1288                         if (err == 0) {
1289                                 fputs("Error:\tInvalid syscall ", trace->output);
1290                                 err = -EINVAL;
1291                         } else {
1292                                 fputs(", ", trace->output);
1293                         }
1294
1295                         fputs(sc, trace->output);
1296                 }
1297
1298                 trace->ev_qualifier_ids.entries[i++] = id;
1299         }
1300
1301         if (err < 0) {
1302                 fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1303                       "\nHint:\tand: 'man syscalls'\n", trace->output);
1304                 zfree(&trace->ev_qualifier_ids.entries);
1305                 trace->ev_qualifier_ids.nr = 0;
1306         }
1307 out:
1308         return err;
1309 }
1310
1311 /*
1312  * args is to be interpreted as a series of longs but we need to handle
1313  * 8-byte unaligned accesses. args points to raw_data within the event
1314  * and raw_data is guaranteed to be 8-byte unaligned because it is
1315  * preceded by raw_size which is a u32. So we need to copy args to a temp
1316  * variable to read it. Most notably this avoids extended load instructions
1317  * on unaligned addresses.
1318  */
1319
1320 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1321                                       unsigned char *args, struct trace *trace,
1322                                       struct thread *thread)
1323 {
1324         size_t printed = 0;
1325         unsigned char *p;
1326         unsigned long val;
1327
1328         if (sc->args != NULL) {
1329                 struct format_field *field;
1330                 u8 bit = 1;
1331                 struct syscall_arg arg = {
1332                         .idx    = 0,
1333                         .mask   = 0,
1334                         .trace  = trace,
1335                         .thread = thread,
1336                 };
1337
1338                 for (field = sc->args; field;
1339                      field = field->next, ++arg.idx, bit <<= 1) {
1340                         if (arg.mask & bit)
1341                                 continue;
1342
1343                         /* special care for unaligned accesses */
1344                         p = args + sizeof(unsigned long) * arg.idx;
1345                         memcpy(&val, p, sizeof(val));
1346
1347                         /*
1348                          * Suppress this argument if its value is zero and
1349                          * we don't have a string associated with it in a
1350                          * strarray.
1351                          */
1352                         if (val == 0 &&
1353                             !(sc->arg_scnprintf &&
1354                               sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1355                               sc->arg_parm[arg.idx]))
1356                                 continue;
1357
1358                         printed += scnprintf(bf + printed, size - printed,
1359                                              "%s%s: ", printed ? ", " : "", field->name);
1360                         if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1361                                 arg.val = val;
1362                                 if (sc->arg_parm)
1363                                         arg.parm = sc->arg_parm[arg.idx];
1364                                 printed += sc->arg_scnprintf[arg.idx](bf + printed,
1365                                                                       size - printed, &arg);
1366                         } else {
1367                                 printed += scnprintf(bf + printed, size - printed,
1368                                                      "%ld", val);
1369                         }
1370                 }
1371         } else if (IS_ERR(sc->tp_format)) {
1372                 /*
1373                  * If we managed to read the tracepoint /format file, then we
1374                  * may end up not having any args, like with gettid(), so only
1375                  * print the raw args when we didn't manage to read it.
1376                  */
1377                 int i = 0;
1378
1379                 while (i < 6) {
1380                         /* special care for unaligned accesses */
1381                         p = args + sizeof(unsigned long) * i;
1382                         memcpy(&val, p, sizeof(val));
1383                         printed += scnprintf(bf + printed, size - printed,
1384                                              "%sarg%d: %ld",
1385                                              printed ? ", " : "", i, val);
1386                         ++i;
1387                 }
1388         }
1389
1390         return printed;
1391 }
1392
1393 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1394                                   union perf_event *event,
1395                                   struct perf_sample *sample);
1396
1397 static struct syscall *trace__syscall_info(struct trace *trace,
1398                                            struct perf_evsel *evsel, int id)
1399 {
1400
1401         if (id < 0) {
1402
1403                 /*
1404                  * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1405                  * before that, leaving at a higher verbosity level till that is
1406                  * explained. Reproduced with plain ftrace with:
1407                  *
1408                  * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1409                  * grep "NR -1 " /t/trace_pipe
1410                  *
1411                  * After generating some load on the machine.
1412                  */
1413                 if (verbose > 1) {
1414                         static u64 n;
1415                         fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1416                                 id, perf_evsel__name(evsel), ++n);
1417                 }
1418                 return NULL;
1419         }
1420
1421         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1422             trace__read_syscall_info(trace, id))
1423                 goto out_cant_read;
1424
1425         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1426                 goto out_cant_read;
1427
1428         return &trace->syscalls.table[id];
1429
1430 out_cant_read:
1431         if (verbose) {
1432                 fprintf(trace->output, "Problems reading syscall %d", id);
1433                 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1434                         fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1435                 fputs(" information\n", trace->output);
1436         }
1437         return NULL;
1438 }
1439
1440 static void thread__update_stats(struct thread_trace *ttrace,
1441                                  int id, struct perf_sample *sample)
1442 {
1443         struct int_node *inode;
1444         struct stats *stats;
1445         u64 duration = 0;
1446
1447         inode = intlist__findnew(ttrace->syscall_stats, id);
1448         if (inode == NULL)
1449                 return;
1450
1451         stats = inode->priv;
1452         if (stats == NULL) {
1453                 stats = malloc(sizeof(struct stats));
1454                 if (stats == NULL)
1455                         return;
1456                 init_stats(stats);
1457                 inode->priv = stats;
1458         }
1459
1460         if (ttrace->entry_time && sample->time > ttrace->entry_time)
1461                 duration = sample->time - ttrace->entry_time;
1462
1463         update_stats(stats, duration);
1464 }
1465
1466 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1467 {
1468         struct thread_trace *ttrace;
1469         u64 duration;
1470         size_t printed;
1471
1472         if (trace->current == NULL)
1473                 return 0;
1474
1475         ttrace = thread__priv(trace->current);
1476
1477         if (!ttrace->entry_pending)
1478                 return 0;
1479
1480         duration = sample->time - ttrace->entry_time;
1481
1482         printed  = trace__fprintf_entry_head(trace, trace->current, duration, true, ttrace->entry_time, trace->output);
1483         printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1484         ttrace->entry_pending = false;
1485
1486         return printed;
1487 }
1488
1489 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1490                             union perf_event *event __maybe_unused,
1491                             struct perf_sample *sample)
1492 {
1493         char *msg;
1494         void *args;
1495         size_t printed = 0;
1496         struct thread *thread;
1497         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1498         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1499         struct thread_trace *ttrace;
1500
1501         if (sc == NULL)
1502                 return -1;
1503
1504         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1505         ttrace = thread__trace(thread, trace->output);
1506         if (ttrace == NULL)
1507                 goto out_put;
1508
1509         args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1510
1511         if (ttrace->entry_str == NULL) {
1512                 ttrace->entry_str = malloc(trace__entry_str_size);
1513                 if (!ttrace->entry_str)
1514                         goto out_put;
1515         }
1516
1517         if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1518                 trace__printf_interrupted_entry(trace, sample);
1519
1520         ttrace->entry_time = sample->time;
1521         msg = ttrace->entry_str;
1522         printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1523
1524         printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1525                                            args, trace, thread);
1526
1527         if (sc->is_exit) {
1528                 if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
1529                         trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
1530                         fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
1531                 }
1532         } else {
1533                 ttrace->entry_pending = true;
1534                 /* See trace__vfs_getname & trace__sys_exit */
1535                 ttrace->filename.pending_open = false;
1536         }
1537
1538         if (trace->current != thread) {
1539                 thread__put(trace->current);
1540                 trace->current = thread__get(thread);
1541         }
1542         err = 0;
1543 out_put:
1544         thread__put(thread);
1545         return err;
1546 }
1547
1548 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1549                                     struct perf_sample *sample,
1550                                     struct callchain_cursor *cursor)
1551 {
1552         struct addr_location al;
1553
1554         if (machine__resolve(trace->host, &al, sample) < 0 ||
1555             thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1556                 return -1;
1557
1558         return 0;
1559 }
1560
1561 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1562 {
1563         /* TODO: user-configurable print_opts */
1564         const unsigned int print_opts = EVSEL__PRINT_SYM |
1565                                         EVSEL__PRINT_DSO |
1566                                         EVSEL__PRINT_UNKNOWN_AS_ADDR;
1567
1568         return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1569 }
1570
1571 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1572                            union perf_event *event __maybe_unused,
1573                            struct perf_sample *sample)
1574 {
1575         long ret;
1576         u64 duration = 0;
1577         bool duration_calculated = false;
1578         struct thread *thread;
1579         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1580         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1581         struct thread_trace *ttrace;
1582
1583         if (sc == NULL)
1584                 return -1;
1585
1586         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1587         ttrace = thread__trace(thread, trace->output);
1588         if (ttrace == NULL)
1589                 goto out_put;
1590
1591         if (trace->summary)
1592                 thread__update_stats(ttrace, id, sample);
1593
1594         ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1595
1596         if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
1597                 trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1598                 ttrace->filename.pending_open = false;
1599                 ++trace->stats.vfs_getname;
1600         }
1601
1602         ttrace->exit_time = sample->time;
1603
1604         if (ttrace->entry_time) {
1605                 duration = sample->time - ttrace->entry_time;
1606                 if (trace__filter_duration(trace, duration))
1607                         goto out;
1608                 duration_calculated = true;
1609         } else if (trace->duration_filter)
1610                 goto out;
1611
1612         if (sample->callchain) {
1613                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1614                 if (callchain_ret == 0) {
1615                         if (callchain_cursor.nr < trace->min_stack)
1616                                 goto out;
1617                         callchain_ret = 1;
1618                 }
1619         }
1620
1621         if (trace->summary_only)
1622                 goto out;
1623
1624         trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
1625
1626         if (ttrace->entry_pending) {
1627                 fprintf(trace->output, "%-70s", ttrace->entry_str);
1628         } else {
1629                 fprintf(trace->output, " ... [");
1630                 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1631                 fprintf(trace->output, "]: %s()", sc->name);
1632         }
1633
1634         if (sc->fmt == NULL) {
1635 signed_print:
1636                 fprintf(trace->output, ") = %ld", ret);
1637         } else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
1638                 char bf[STRERR_BUFSIZE];
1639                 const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
1640                            *e = audit_errno_to_name(-ret);
1641
1642                 fprintf(trace->output, ") = -1 %s %s", e, emsg);
1643         } else if (ret == 0 && sc->fmt->timeout)
1644                 fprintf(trace->output, ") = 0 Timeout");
1645         else if (sc->fmt->hexret)
1646                 fprintf(trace->output, ") = %#lx", ret);
1647         else if (sc->fmt->errpid) {
1648                 struct thread *child = machine__find_thread(trace->host, ret, ret);
1649
1650                 if (child != NULL) {
1651                         fprintf(trace->output, ") = %ld", ret);
1652                         if (child->comm_set)
1653                                 fprintf(trace->output, " (%s)", thread__comm_str(child));
1654                         thread__put(child);
1655                 }
1656         } else
1657                 goto signed_print;
1658
1659         fputc('\n', trace->output);
1660
1661         if (callchain_ret > 0)
1662                 trace__fprintf_callchain(trace, sample);
1663         else if (callchain_ret < 0)
1664                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1665 out:
1666         ttrace->entry_pending = false;
1667         err = 0;
1668 out_put:
1669         thread__put(thread);
1670         return err;
1671 }
1672
1673 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1674                               union perf_event *event __maybe_unused,
1675                               struct perf_sample *sample)
1676 {
1677         struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1678         struct thread_trace *ttrace;
1679         size_t filename_len, entry_str_len, to_move;
1680         ssize_t remaining_space;
1681         char *pos;
1682         const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1683
1684         if (!thread)
1685                 goto out;
1686
1687         ttrace = thread__priv(thread);
1688         if (!ttrace)
1689                 goto out;
1690
1691         filename_len = strlen(filename);
1692
1693         if (ttrace->filename.namelen < filename_len) {
1694                 char *f = realloc(ttrace->filename.name, filename_len + 1);
1695
1696                 if (f == NULL)
1697                         goto out;
1698
1699                 ttrace->filename.namelen = filename_len;
1700                 ttrace->filename.name = f;
1701         }
1702
1703         strcpy(ttrace->filename.name, filename);
1704         ttrace->filename.pending_open = true;
1705
1706         if (!ttrace->filename.ptr)
1707                 goto out;
1708
1709         entry_str_len = strlen(ttrace->entry_str);
1710         remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1711         if (remaining_space <= 0)
1712                 goto out;
1713
1714         if (filename_len > (size_t)remaining_space) {
1715                 filename += filename_len - remaining_space;
1716                 filename_len = remaining_space;
1717         }
1718
1719         to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1720         pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1721         memmove(pos + filename_len, pos, to_move);
1722         memcpy(pos, filename, filename_len);
1723
1724         ttrace->filename.ptr = 0;
1725         ttrace->filename.entry_str_pos = 0;
1726 out:
1727         return 0;
1728 }
1729
1730 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1731                                      union perf_event *event __maybe_unused,
1732                                      struct perf_sample *sample)
1733 {
1734         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1735         double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1736         struct thread *thread = machine__findnew_thread(trace->host,
1737                                                         sample->pid,
1738                                                         sample->tid);
1739         struct thread_trace *ttrace = thread__trace(thread, trace->output);
1740
1741         if (ttrace == NULL)
1742                 goto out_dump;
1743
1744         ttrace->runtime_ms += runtime_ms;
1745         trace->runtime_ms += runtime_ms;
1746         thread__put(thread);
1747         return 0;
1748
1749 out_dump:
1750         fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 "\n",
1751                evsel->name,
1752                perf_evsel__strval(evsel, sample, "comm"),
1753                (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1754                runtime,
1755                perf_evsel__intval(evsel, sample, "vruntime"));
1756         thread__put(thread);
1757         return 0;
1758 }
1759
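/*
 * Printer callback for print_binary(): only BINARY_PRINT_CHAR_DATA is
 * handled, so the payload of a bpf-output event is emitted as plain
 * characters, with non-printable bytes shown as '.'. For example, a
 * payload of "Hello\0perf" would come out as "Hello.perf".
 */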
1760 static void bpf_output__printer(enum binary_printer_ops op,
1761                                 unsigned int val, void *extra)
1762 {
1763         FILE *output = extra;
1764         unsigned char ch = (unsigned char)val;
1765
1766         switch (op) {
1767         case BINARY_PRINT_CHAR_DATA:
1768                 fprintf(output, "%c", isprint(ch) ? ch : '.');
1769                 break;
1770         case BINARY_PRINT_DATA_BEGIN:
1771         case BINARY_PRINT_LINE_BEGIN:
1772         case BINARY_PRINT_ADDR:
1773         case BINARY_PRINT_NUM_DATA:
1774         case BINARY_PRINT_NUM_PAD:
1775         case BINARY_PRINT_SEP:
1776         case BINARY_PRINT_CHAR_PAD:
1777         case BINARY_PRINT_LINE_END:
1778         case BINARY_PRINT_DATA_END:
1779         default:
1780                 break;
1781         }
1782 }
1783
1784 static void bpf_output__fprintf(struct trace *trace,
1785                                 struct perf_sample *sample)
1786 {
1787         print_binary(sample->raw_data, sample->raw_size, 8,
1788                      bpf_output__printer, trace->output);
1789 }
1790
1791 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1792                                 union perf_event *event __maybe_unused,
1793                                 struct perf_sample *sample)
1794 {
1795         int callchain_ret = 0;
1796
1797         if (sample->callchain) {
1798                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1799                 if (callchain_ret == 0) {
1800                         if (callchain_cursor.nr < trace->min_stack)
1801                                 goto out;
1802                         callchain_ret = 1;
1803                 }
1804         }
1805
1806         trace__printf_interrupted_entry(trace, sample);
1807         trace__fprintf_tstamp(trace, sample->time, trace->output);
1808
1809         if (trace->trace_syscalls)
1810                 fprintf(trace->output, "(         ): ");
1811
1812         fprintf(trace->output, "%s:", evsel->name);
1813
1814         if (perf_evsel__is_bpf_output(evsel)) {
1815                 bpf_output__fprintf(trace, sample);
1816         } else if (evsel->tp_format) {
1817                 event_format__fprintf(evsel->tp_format, sample->cpu,
1818                                       sample->raw_data, sample->raw_size,
1819                                       trace->output);
1820         }
1821
1822         fprintf(trace->output, ")\n");
1823
1824         if (callchain_ret > 0)
1825                 trace__fprintf_callchain(trace, sample);
1826         else if (callchain_ret < 0)
1827                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1828 out:
1829         return 0;
1830 }
1831
1832 static void print_location(FILE *f, struct perf_sample *sample,
1833                            struct addr_location *al,
1834                            bool print_dso, bool print_sym)
1835 {
1836
1837         if ((verbose || print_dso) && al->map)
1838                 fprintf(f, "%s@", al->map->dso->long_name);
1839
1840         if ((verbose || print_sym) && al->sym)
1841                 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1842                         al->addr - al->sym->start);
1843         else if (al->map)
1844                 fprintf(f, "0x%" PRIx64, al->addr);
1845         else
1846                 fprintf(f, "0x%" PRIx64, sample->addr);
1847 }
1848
1849 static int trace__pgfault(struct trace *trace,
1850                           struct perf_evsel *evsel,
1851                           union perf_event *event __maybe_unused,
1852                           struct perf_sample *sample)
1853 {
1854         struct thread *thread;
1855         struct addr_location al;
1856         char map_type = 'd';
1857         struct thread_trace *ttrace;
1858         int err = -1;
1859         int callchain_ret = 0;
1860
1861         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1862
1863         if (sample->callchain) {
1864                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1865                 if (callchain_ret == 0) {
1866                         if (callchain_cursor.nr < trace->min_stack)
1867                                 goto out_put;
1868                         callchain_ret = 1;
1869                 }
1870         }
1871
1872         ttrace = thread__trace(thread, trace->output);
1873         if (ttrace == NULL)
1874                 goto out_put;
1875
1876         if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
1877                 ttrace->pfmaj++;
1878         else
1879                 ttrace->pfmin++;
1880
1881         if (trace->summary_only)
1882                 goto out;
1883
1884         thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
1885                               sample->ip, &al);
1886
1887         trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
1888
1889         fprintf(trace->output, "%sfault [",
1890                 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
1891                 "maj" : "min");
1892
1893         print_location(trace->output, sample, &al, false, true);
1894
1895         fprintf(trace->output, "] => ");
1896
1897         thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
1898                                    sample->addr, &al);
1899
1900         if (!al.map) {
1901                 thread__find_addr_location(thread, sample->cpumode,
1902                                            MAP__FUNCTION, sample->addr, &al);
1903
1904                 if (al.map)
1905                         map_type = 'x';
1906                 else
1907                         map_type = '?';
1908         }
1909
1910         print_location(trace->output, sample, &al, true, false);
1911
1912         fprintf(trace->output, " (%c%c)\n", map_type, al.level);
1913
1914         if (callchain_ret > 0)
1915                 trace__fprintf_callchain(trace, sample);
1916         else if (callchain_ret < 0)
1917                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1918 out:
1919         err = 0;
1920 out_put:
1921         thread__put(thread);
1922         return err;
1923 }
1924
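/*
 * Returns true when --pid/--tid lists were given and the sample matches
 * none of them, i.e. the caller should skip it; samples matching one of
 * the lists, or samples seen when no lists were set up, are not skipped.
 */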
1925 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
1926 {
1927         if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
1928             (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
1929                 return false;
1930
1931         if (trace->pid_list || trace->tid_list)
1932                 return true;
1933
1934         return false;
1935 }
1936
1937 static void trace__set_base_time(struct trace *trace,
1938                                  struct perf_evsel *evsel,
1939                                  struct perf_sample *sample)
1940 {
1941         /*
1942          * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1943          * and don't use sample->time unconditionally: we may end up having
1944          * some other event in the future without PERF_SAMPLE_TIME for a good
1945          * reason, i.e. we may not be interested in its timestamps, just in
1946          * the fact that it took place, picking up some piece of information
1947          * when it appears in our event stream (vfs_getname comes to mind).
1948          */
1949         if (trace->base_time == 0 && !trace->full_time &&
1950             (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1951                 trace->base_time = sample->time;
1952 }
1953
1954 static int trace__process_sample(struct perf_tool *tool,
1955                                  union perf_event *event,
1956                                  struct perf_sample *sample,
1957                                  struct perf_evsel *evsel,
1958                                  struct machine *machine __maybe_unused)
1959 {
1960         struct trace *trace = container_of(tool, struct trace, tool);
1961         int err = 0;
1962
1963         tracepoint_handler handler = evsel->handler;
1964
1965         if (skip_sample(trace, sample))
1966                 return 0;
1967
1968         trace__set_base_time(trace, evsel, sample);
1969
1970         if (handler) {
1971                 ++trace->nr_events;
1972                 handler(trace, evsel, event, sample);
1973         }
1974
1975         return err;
1976 }
1977
1978 static int parse_target_str(struct trace *trace)
1979 {
1980         if (trace->opts.target.pid) {
1981                 trace->pid_list = intlist__new(trace->opts.target.pid);
1982                 if (trace->pid_list == NULL) {
1983                         pr_err("Error parsing process id string\n");
1984                         return -EINVAL;
1985                 }
1986         }
1987
1988         if (trace->opts.target.tid) {
1989                 trace->tid_list = intlist__new(trace->opts.target.tid);
1990                 if (trace->tid_list == NULL) {
1991                         pr_err("Error parsing thread id string\n");
1992                         return -EINVAL;
1993                 }
1994         }
1995
1996         return 0;
1997 }
1998
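/*
 * 'perf trace record' is implemented by handing off to 'perf record'.
 * For an invocation tracing syscalls plus both page fault types, the
 * argv assembled below amounts to roughly (an illustration, not
 * verbatim output):
 *
 *	perf record -R -m 1024 -c 1 \
 *		-e raw_syscalls:sys_enter,raw_syscalls:sys_exit \
 *		-e major-faults -e minor-faults <workload>
 */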
1999 static int trace__record(struct trace *trace, int argc, const char **argv)
2000 {
2001         unsigned int rec_argc, i, j;
2002         const char **rec_argv;
2003         const char * const record_args[] = {
2004                 "record",
2005                 "-R",
2006                 "-m", "1024",
2007                 "-c", "1",
2008         };
2009
2010         const char * const sc_args[] = { "-e", };
2011         unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2012         const char * const majpf_args[] = { "-e", "major-faults" };
2013         unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2014         const char * const minpf_args[] = { "-e", "minor-faults" };
2015         unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2016
2017         /* +1 is for the event string below */
2018         rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2019                 majpf_args_nr + minpf_args_nr + argc;
2020         rec_argv = calloc(rec_argc + 1, sizeof(char *));
2021
2022         if (rec_argv == NULL)
2023                 return -ENOMEM;
2024
2025         j = 0;
2026         for (i = 0; i < ARRAY_SIZE(record_args); i++)
2027                 rec_argv[j++] = record_args[i];
2028
2029         if (trace->trace_syscalls) {
2030                 for (i = 0; i < sc_args_nr; i++)
2031                         rec_argv[j++] = sc_args[i];
2032
2033                 /* event string may be different for older kernels - e.g., RHEL6 */
2034                 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2035                         rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2036                 else if (is_valid_tracepoint("syscalls:sys_enter"))
2037                         rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2038                 else {
2039                         pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2040                         return -1;
2041                 }
2042         }
2043
2044         if (trace->trace_pgfaults & TRACE_PFMAJ)
2045                 for (i = 0; i < majpf_args_nr; i++)
2046                         rec_argv[j++] = majpf_args[i];
2047
2048         if (trace->trace_pgfaults & TRACE_PFMIN)
2049                 for (i = 0; i < minpf_args_nr; i++)
2050                         rec_argv[j++] = minpf_args[i];
2051
2052         for (i = 0; i < (unsigned int)argc; i++)
2053                 rec_argv[j++] = argv[i];
2054
2055         return cmd_record(j, rec_argv, NULL);
2056 }
2057
2058 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2059
2060 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2061 {
2062         struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2063
2064         if (IS_ERR(evsel))
2065                 return false;
2066
2067         if (perf_evsel__field(evsel, "pathname") == NULL) {
2068                 perf_evsel__delete(evsel);
2069                 return false;
2070         }
2071
2072         evsel->handler = trace__vfs_getname;
2073         perf_evlist__add(evlist, evsel);
2074         return true;
2075 }
2076
2077 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2078 {
2079         struct perf_evsel *evsel;
2080         struct perf_event_attr attr = {
2081                 .type = PERF_TYPE_SOFTWARE,
2082                 .mmap_data = 1,
2083         };
2084
2085         attr.config = config;
2086         attr.sample_period = 1;
2087
2088         event_attr_init(&attr);
2089
2090         evsel = perf_evsel__new(&attr);
2091         if (evsel)
2092                 evsel->handler = trace__pgfault;
2093
2094         return evsel;
2095 }
2096
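/*
 * Dispatch one mmap'ed event: non-sample records go to
 * trace__process_event(), samples are matched to their evsel via
 * sample->id and then handed to that evsel's handler (trace__sys_enter,
 * trace__sys_exit, trace__pgfault, ...).
 */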
2097 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2098 {
2099         const u32 type = event->header.type;
2100         struct perf_evsel *evsel;
2101
2102         if (type != PERF_RECORD_SAMPLE) {
2103                 trace__process_event(trace, trace->host, event, sample);
2104                 return;
2105         }
2106
2107         evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2108         if (evsel == NULL) {
2109                 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2110                 return;
2111         }
2112
2113         trace__set_base_time(trace, evsel, sample);
2114
2115         if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2116             sample->raw_data == NULL) {
2117                 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2118                        perf_evsel__name(evsel), sample->tid,
2119                        sample->cpu, sample->raw_size);
2120         } else {
2121                 tracepoint_handler handler = evsel->handler;
2122                 handler(trace, evsel, event, sample);
2123         }
2124 }
2125
2126 static int trace__add_syscall_newtp(struct trace *trace)
2127 {
2128         int ret = -1;
2129         struct perf_evlist *evlist = trace->evlist;
2130         struct perf_evsel *sys_enter, *sys_exit;
2131
2132         sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2133         if (sys_enter == NULL)
2134                 goto out;
2135
2136         if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2137                 goto out_delete_sys_enter;
2138
2139         sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2140         if (sys_exit == NULL)
2141                 goto out_delete_sys_enter;
2142
2143         if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2144                 goto out_delete_sys_exit;
2145
2146         perf_evlist__add(evlist, sys_enter);
2147         perf_evlist__add(evlist, sys_exit);
2148
2149         if (callchain_param.enabled && !trace->kernel_syscallchains) {
2150                 /*
2151                  * We're interested only in the user space callchain
2152                  * leading to the syscall; allow overriding that for
2153                  * debugging reasons using --kernel-syscall-graph.
2154                  */
2155                 sys_exit->attr.exclude_callchain_kernel = 1;
2156         }
2157
2158         trace->syscalls.events.sys_enter = sys_enter;
2159         trace->syscalls.events.sys_exit  = sys_exit;
2160
2161         ret = 0;
2162 out:
2163         return ret;
2164
2165 out_delete_sys_exit:
2166         perf_evsel__delete_priv(sys_exit);
2167 out_delete_sys_enter:
2168         perf_evsel__delete_priv(sys_enter);
2169         goto out;
2170 }
2171
2172 static int trace__set_ev_qualifier_filter(struct trace *trace)
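/*
 * The tracepoint filter built below from the syscall id qualifier is a
 * plain expression over the 'id' field; for example, '-e open,close' on
 * x86_64 should come out roughly as "id == 2 || id == 3", and a negated
 * qualifier ('!') produces the "id != ... && ..." form instead.
 */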
2173 {
2174         int err = -1;
2175         struct perf_evsel *sys_exit;
2176         char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2177                                                 trace->ev_qualifier_ids.nr,
2178                                                 trace->ev_qualifier_ids.entries);
2179
2180         if (filter == NULL)
2181                 goto out_enomem;
2182
2183         if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2184                                           filter)) {
2185                 sys_exit = trace->syscalls.events.sys_exit;
2186                 err = perf_evsel__append_tp_filter(sys_exit, filter);
2187         }
2188
2189         free(filter);
2190 out:
2191         return err;
2192 out_enomem:
2193         errno = ENOMEM;
2194         goto out;
2195 }
2196
2197 static int trace__run(struct trace *trace, int argc, const char **argv)
2198 {
2199         struct perf_evlist *evlist = trace->evlist;
2200         struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2201         int err = -1, i;
2202         unsigned long before;
2203         const bool forks = argc > 0;
2204         bool draining = false;
2205
2206         trace->live = true;
2207
2208         if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2209                 goto out_error_raw_syscalls;
2210
2211         if (trace->trace_syscalls)
2212                 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2213
2214         if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2215                 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2216                 if (pgfault_maj == NULL)
2217                         goto out_error_mem;
2218                 perf_evlist__add(evlist, pgfault_maj);
2219         }
2220
2221         if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2222                 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2223                 if (pgfault_min == NULL)
2224                         goto out_error_mem;
2225                 perf_evlist__add(evlist, pgfault_min);
2226         }
2227
2228         if (trace->sched &&
2229             perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2230                                    trace__sched_stat_runtime))
2231                 goto out_error_sched_stat_runtime;
2232
2233         err = perf_evlist__create_maps(evlist, &trace->opts.target);
2234         if (err < 0) {
2235                 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2236                 goto out_delete_evlist;
2237         }
2238
2239         err = trace__symbols_init(trace, evlist);
2240         if (err < 0) {
2241                 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2242                 goto out_delete_evlist;
2243         }
2244
2245         perf_evlist__config(evlist, &trace->opts, NULL);
2246
2247         if (callchain_param.enabled) {
2248                 bool use_identifier = false;
2249
2250                 if (trace->syscalls.events.sys_exit) {
2251                         perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
2252                                                      &trace->opts, &callchain_param);
2253                         use_identifier = true;
2254                 }
2255
2256                 if (pgfault_maj) {
2257                         perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2258                         use_identifier = true;
2259                 }
2260
2261                 if (pgfault_min) {
2262                         perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2263                         use_identifier = true;
2264                 }
2265
2266                 if (use_identifier) {
2267                        /*
2268                         * Now we have evsels with different sample_ids, use
2269                         * PERF_SAMPLE_IDENTIFIER to map from sample to evsel
2270                         * from a fixed position in each ring buffer record.
2271                         *
2272                         * As of the changeset introducing this comment, this
2273                         * isn't strictly needed, as the fields that can come before
2274                         * PERF_SAMPLE_ID are all used, but we'll probably disable
2275                         * some of those for things like copying the payload of
2276                         * pointer syscall arguments, and for vfs_getname we don't
2277                         * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
2278                         * here as a warning that we need to use PERF_SAMPLE_IDENTIFIER.
2279                         */
2280                         perf_evlist__set_sample_bit(evlist, IDENTIFIER);
2281                         perf_evlist__reset_sample_bit(evlist, ID);
2282                 }
2283         }
2284
2285         signal(SIGCHLD, sig_handler);
2286         signal(SIGINT, sig_handler);
2287
2288         if (forks) {
2289                 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2290                                                     argv, false, NULL);
2291                 if (err < 0) {
2292                         fprintf(trace->output, "Couldn't run the workload!\n");
2293                         goto out_delete_evlist;
2294                 }
2295         }
2296
2297         err = perf_evlist__open(evlist);
2298         if (err < 0)
2299                 goto out_error_open;
2300
2301         err = bpf__apply_obj_config();
2302         if (err) {
2303                 char errbuf[BUFSIZ];
2304
2305                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2306                 pr_err("ERROR: Apply config to BPF failed: %s\n",
2307                          errbuf);
2308                 goto out_error_open;
2309         }
2310
2311         /*
2312          * Better not use !target__has_task() here because we need to cover the
2313          * case where no threads were specified in the command line, but a
2314          * workload was, and in that case we will fill in the thread_map when
2315          * we fork the workload in perf_evlist__prepare_workload.
2316          */
2317         if (trace->filter_pids.nr > 0)
2318                 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2319         else if (thread_map__pid(evlist->threads, 0) == -1)
2320                 err = perf_evlist__set_filter_pid(evlist, getpid());
2321
2322         if (err < 0)
2323                 goto out_error_mem;
2324
2325         if (trace->ev_qualifier_ids.nr > 0) {
2326                 err = trace__set_ev_qualifier_filter(trace);
2327                 if (err < 0)
2328                         goto out_errno;
2329
2330                 pr_debug("event qualifier tracepoint filter: %s\n",
2331                          trace->syscalls.events.sys_exit->filter);
2332         }
2333
2334         err = perf_evlist__apply_filters(evlist, &evsel);
2335         if (err < 0)
2336                 goto out_error_apply_filters;
2337
2338         err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2339         if (err < 0)
2340                 goto out_error_mmap;
2341
2342         if (!target__none(&trace->opts.target))
2343                 perf_evlist__enable(evlist);
2344
2345         if (forks)
2346                 perf_evlist__start_workload(evlist);
2347
2348         trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2349                                   evlist->threads->nr > 1 ||
2350                                   perf_evlist__first(evlist)->attr.inherit;
2351 again:
2352         before = trace->nr_events;
2353
2354         for (i = 0; i < evlist->nr_mmaps; i++) {
2355                 union perf_event *event;
2356
2357                 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2358                         struct perf_sample sample;
2359
2360                         ++trace->nr_events;
2361
2362                         err = perf_evlist__parse_sample(evlist, event, &sample);
2363                         if (err) {
2364                                 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2365                                 goto next_event;
2366                         }
2367
2368                         trace__handle_event(trace, event, &sample);
2369 next_event:
2370                         perf_evlist__mmap_consume(evlist, i);
2371
2372                         if (interrupted)
2373                                 goto out_disable;
2374
2375                         if (done && !draining) {
2376                                 perf_evlist__disable(evlist);
2377                                 draining = true;
2378                         }
2379                 }
2380         }
2381
2382         if (trace->nr_events == before) {
2383                 int timeout = done ? 100 : -1;
2384
2385                 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2386                         if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2387                                 draining = true;
2388
2389                         goto again;
2390                 }
2391         } else {
2392                 goto again;
2393         }
2394
2395 out_disable:
2396         thread__zput(trace->current);
2397
2398         perf_evlist__disable(evlist);
2399
2400         if (!err) {
2401                 if (trace->summary)
2402                         trace__fprintf_thread_summary(trace, trace->output);
2403
2404                 if (trace->show_tool_stats) {
2405                         fprintf(trace->output, "Stats:\n "
2406                                                " vfs_getname : %" PRIu64 "\n"
2407                                                " proc_getname: %" PRIu64 "\n",
2408                                 trace->stats.vfs_getname,
2409                                 trace->stats.proc_getname);
2410                 }
2411         }
2412
2413 out_delete_evlist:
2414         perf_evlist__delete(evlist);
2415         trace->evlist = NULL;
2416         trace->live = false;
2417         return err;
2418 {
2419         char errbuf[BUFSIZ];
2420
2421 out_error_sched_stat_runtime:
2422         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2423         goto out_error;
2424
2425 out_error_raw_syscalls:
2426         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2427         goto out_error;
2428
2429 out_error_mmap:
2430         perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2431         goto out_error;
2432
2433 out_error_open:
2434         perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2435
2436 out_error:
2437         fprintf(trace->output, "%s\n", errbuf);
2438         goto out_delete_evlist;
2439
2440 out_error_apply_filters:
2441         fprintf(trace->output,
2442                 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2443                 evsel->filter, perf_evsel__name(evsel), errno,
2444                 str_error_r(errno, errbuf, sizeof(errbuf)));
2445         goto out_delete_evlist;
2446 }
2447 out_error_mem:
2448         fprintf(trace->output, "Not enough memory to run!\n");
2449         goto out_delete_evlist;
2450
2451 out_errno:
2452         fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2453         goto out_delete_evlist;
2454 }
2455
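/*
 * Replay mode (-i): instead of tracing live, process a previously
 * recorded perf.data file through the session machinery, wiring up the
 * same sys_enter/sys_exit/page fault handlers used in live mode.
 */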
2456 static int trace__replay(struct trace *trace)
2457 {
2458         const struct perf_evsel_str_handler handlers[] = {
2459                 { "probe:vfs_getname",       trace__vfs_getname, },
2460         };
2461         struct perf_data_file file = {
2462                 .path  = input_name,
2463                 .mode  = PERF_DATA_MODE_READ,
2464                 .force = trace->force,
2465         };
2466         struct perf_session *session;
2467         struct perf_evsel *evsel;
2468         int err = -1;
2469
2470         trace->tool.sample        = trace__process_sample;
2471         trace->tool.mmap          = perf_event__process_mmap;
2472         trace->tool.mmap2         = perf_event__process_mmap2;
2473         trace->tool.comm          = perf_event__process_comm;
2474         trace->tool.exit          = perf_event__process_exit;
2475         trace->tool.fork          = perf_event__process_fork;
2476         trace->tool.attr          = perf_event__process_attr;
2477         trace->tool.tracing_data  = perf_event__process_tracing_data;
2478         trace->tool.build_id      = perf_event__process_build_id;
2479
2480         trace->tool.ordered_events = true;
2481         trace->tool.ordering_requires_timestamps = true;
2482
2483         /* add tid to output */
2484         trace->multiple_threads = true;
2485
2486         session = perf_session__new(&file, false, &trace->tool);
2487         if (session == NULL)
2488                 return -1;
2489
2490         if (symbol__init(&session->header.env) < 0)
2491                 goto out;
2492
2493         trace->host = &session->machines.host;
2494
2495         err = perf_session__set_tracepoints_handlers(session, handlers);
2496         if (err)
2497                 goto out;
2498
2499         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2500                                                      "raw_syscalls:sys_enter");
2501         /* older kernels have syscalls tp versus raw_syscalls */
2502         if (evsel == NULL)
2503                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2504                                                              "syscalls:sys_enter");
2505
2506         if (evsel &&
2507             (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2508             perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2509                 pr_err("Error initializing raw_syscalls:sys_enter event\n");
2510                 goto out;
2511         }
2512
2513         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2514                                                      "raw_syscalls:sys_exit");
2515         if (evsel == NULL)
2516                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2517                                                              "syscalls:sys_exit");
2518         if (evsel &&
2519             (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2520             perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2521                 pr_err("Error initializing raw_syscalls:sys_exit event\n");
2522                 goto out;
2523         }
2524
2525         evlist__for_each_entry(session->evlist, evsel) {
2526                 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2527                     (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2528                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2529                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2530                         evsel->handler = trace__pgfault;
2531         }
2532
2533         err = parse_target_str(trace);
2534         if (err != 0)
2535                 goto out;
2536
2537         setup_pager();
2538
2539         err = perf_session__process_events(session);
2540         if (err)
2541                 pr_err("Failed to process events, error %d\n", err);
2542
2543         else if (trace->summary)
2544                 trace__fprintf_thread_summary(trace, trace->output);
2545
2546 out:
2547         perf_session__delete(session);
2548
2549         return err;
2550 }
2551
2552 static size_t trace__fprintf_threads_header(FILE *fp)
2553 {
2554         size_t printed;
2555
2556         printed  = fprintf(fp, "\n Summary of events:\n\n");
2557
2558         return printed;
2559 }
2560
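/*
 * Re-sort the per-thread syscall_stats intlist into an rb tree keyed by
 * msecs (total time spent in each syscall), so that thread__dump_stats()
 * prints the summary in time order rather than syscall id order.
 */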
2561 DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
2562         struct stats    *stats;
2563         double          msecs;
2564         int             syscall;
2565 )
2566 {
2567         struct int_node *source = rb_entry(nd, struct int_node, rb_node);
2568         struct stats *stats = source->priv;
2569
2570         entry->syscall = source->i;
2571         entry->stats   = stats;
2572         entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
2573 }
2574
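/*
 * Prints a per-thread summary table along these lines (values are
 * illustrative only):
 *
 *   syscall            calls    total       min       avg       max      stddev
 *                               (msec)    (msec)    (msec)    (msec)        (%)
 *   --------------- -------- --------- --------- --------- ---------     ------
 *   read                  42     1.234     0.002     0.029     0.411     55.13%
 */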
2575 static size_t thread__dump_stats(struct thread_trace *ttrace,
2576                                  struct trace *trace, FILE *fp)
2577 {
2578         size_t printed = 0;
2579         struct syscall *sc;
2580         struct rb_node *nd;
2581         DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2582
2583         if (syscall_stats == NULL)
2584                 return 0;
2585
2586         printed += fprintf(fp, "\n");
2587
2588         printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
2589         printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
2590         printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
2591
2592         resort_rb__for_each_entry(nd, syscall_stats) {
2593                 struct stats *stats = syscall_stats_entry->stats;
2594                 if (stats) {
2595                         double min = (double)(stats->min) / NSEC_PER_MSEC;
2596                         double max = (double)(stats->max) / NSEC_PER_MSEC;
2597                         double avg = avg_stats(stats);
2598                         double pct;
2599                         u64 n = (u64) stats->n;
2600
2601                         pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2602                         avg /= NSEC_PER_MSEC;
2603
2604                         sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2605                         printed += fprintf(fp, "   %-15s", sc->name);
2606                         printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2607                                            n, syscall_stats_entry->msecs, min, avg);
2608                         printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2609                 }
2610         }
2611
2612         resort_rb__delete(syscall_stats);
2613         printed += fprintf(fp, "\n\n");
2614
2615         return printed;
2616 }
2617
2618 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2619 {
2620         size_t printed = 0;
2621         struct thread_trace *ttrace = thread__priv(thread);
2622         double ratio;
2623
2624         if (ttrace == NULL)
2625                 return 0;
2626
2627         ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2628
2629         printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2630         printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2631         printed += fprintf(fp, "%.1f%%", ratio);
2632         if (ttrace->pfmaj)
2633                 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2634         if (ttrace->pfmin)
2635                 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2636         if (trace->sched)
2637                 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2638         else if (fputc('\n', fp) != EOF)
2639                 ++printed;
2640
2641         printed += thread__dump_stats(ttrace, trace, fp);
2642
2643         return printed;
2644 }
2645
2646 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2647 {
2648         return ttrace ? ttrace->nr_events : 0;
2649 }
2650
2651 DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
2652         struct thread *thread;
2653 )
2654 {
2655         entry->thread = rb_entry(nd, struct thread, rb_node);
2656 }
2657
2658 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2659 {
2660         DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
2661         size_t printed = trace__fprintf_threads_header(fp);
2662         struct rb_node *nd;
2663
2664         if (threads == NULL) {
2665                 fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2666                 return 0;
2667         }
2668
2669         resort_rb__for_each_entry(nd, threads)
2670                 printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2671
2672         resort_rb__delete(threads);
2673
2674         return printed;
2675 }
2676
2677 static int trace__set_duration(const struct option *opt, const char *str,
2678                                int unset __maybe_unused)
2679 {
2680         struct trace *trace = opt->value;
2681
2682         trace->duration_filter = atof(str);
2683         return 0;
2684 }
2685
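/*
 * Parses the --filter-pids CSV list, e.g. 'perf trace --filter-pids 1234,5678',
 * into trace->filter_pids; getpid() is prepended so that perf trace does not
 * end up tracing its own activity.
 */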
2686 static int trace__set_filter_pids(const struct option *opt, const char *str,
2687                                   int unset __maybe_unused)
2688 {
2689         int ret = -1;
2690         size_t i;
2691         struct trace *trace = opt->value;
2692         /*
2693          * FIXME: introduce an intarray class, plain parse the csv and create a
2694          * { int nr, int entries[] } struct...
2695          */
2696         struct intlist *list = intlist__new(str);
2697
2698         if (list == NULL)
2699                 return -1;
2700
2701         i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2702         trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2703
2704         if (trace->filter_pids.entries == NULL)
2705                 goto out;
2706
2707         trace->filter_pids.entries[0] = getpid();
2708
2709         for (i = 1; i < trace->filter_pids.nr; ++i)
2710                 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2711
2712         intlist__delete(list);
2713         ret = 0;
2714 out:
2715         return ret;
2716 }
2717
2718 static int trace__open_output(struct trace *trace, const char *filename)
2719 {
2720         struct stat st;
2721
2722         if (!stat(filename, &st) && st.st_size) {
2723                 char oldname[PATH_MAX];
2724
2725                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2726                 unlink(oldname);
2727                 rename(filename, oldname);
2728         }
2729
2730         trace->output = fopen(filename, "w");
2731
2732         return trace->output == NULL ? -errno : 0;
2733 }
2734
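/*
 * Handler for -F/--pf: "all" enables both page fault types, "maj" or "min"
 * selects just one, e.g. 'perf trace -F maj ls' or 'perf trace --pf all ls'.
 */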
2735 static int parse_pagefaults(const struct option *opt, const char *str,
2736                             int unset __maybe_unused)
2737 {
2738         int *trace_pgfaults = opt->value;
2739
2740         if (strcmp(str, "all") == 0)
2741                 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2742         else if (strcmp(str, "maj") == 0)
2743                 *trace_pgfaults |= TRACE_PFMAJ;
2744         else if (strcmp(str, "min") == 0)
2745                 *trace_pgfaults |= TRACE_PFMIN;
2746         else
2747                 return -1;
2748
2749         return 0;
2750 }
2751
2752 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2753 {
2754         struct perf_evsel *evsel;
2755
2756         evlist__for_each_entry(evlist, evsel)
2757                 evsel->handler = handler;
2758 }
2759
2760 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
2761 {
2762         const char *trace_usage[] = {
2763                 "perf trace [<options>] [<command>]",
2764                 "perf trace [<options>] -- <command> [<options>]",
2765                 "perf trace record [<options>] [<command>]",
2766                 "perf trace record [<options>] -- <command> [<options>]",
2767                 NULL
2768         };
2769         struct trace trace = {
2770                 .syscalls = {
2771                         .max = -1,
2772                 },
2773                 .opts = {
2774                         .target = {
2775                                 .uid       = UINT_MAX,
2776                                 .uses_mmap = true,
2777                         },
2778                         .user_freq     = UINT_MAX,
2779                         .user_interval = ULLONG_MAX,
2780                         .no_buffering  = true,
2781                         .mmap_pages    = UINT_MAX,
2782                         .proc_map_timeout  = 500,
2783                 },
2784                 .output = stderr,
2785                 .show_comm = true,
2786                 .trace_syscalls = true,
2787                 .kernel_syscallchains = false,
2788                 .max_stack = UINT_MAX,
2789         };
2790         const char *output_name = NULL;
2791         const char *ev_qualifier_str = NULL;
2792         const struct option trace_options[] = {
2793         OPT_CALLBACK(0, "event", &trace.evlist, "event",
2794                      "event selector. use 'perf list' to list available events",
2795                      parse_events_option),
2796         OPT_BOOLEAN(0, "comm", &trace.show_comm,
2797                     "show the thread COMM next to its id"),
2798         OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2799         OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
2800         OPT_STRING('o', "output", &output_name, "file", "output file name"),
2801         OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2802         OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2803                     "trace events on existing process id"),
2804         OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2805                     "trace events on existing thread id"),
2806         OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
2807                      "pids to filter (by the kernel)", trace__set_filter_pids),
2808         OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2809                     "system-wide collection from all CPUs"),
2810         OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2811                     "list of cpus to monitor"),
2812         OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2813                     "child tasks do not inherit counters"),
2814         OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2815                      "number of mmap data pages",
2816                      perf_evlist__parse_mmap_pages),
2817         OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2818                    "user to profile"),
2819         OPT_CALLBACK(0, "duration", &trace, "float",
2820                      "show only events with duration > N.M ms",
2821                      trace__set_duration),
2822         OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2823         OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2824         OPT_BOOLEAN('T', "time", &trace.full_time,
2825                     "Show full timestamp, not time relative to first start"),
2826         OPT_BOOLEAN('s', "summary", &trace.summary_only,
2827                     "Show only syscall summary with statistics"),
2828         OPT_BOOLEAN('S', "with-summary", &trace.summary,
2829                     "Show all syscalls and summary with statistics"),
2830         OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2831                      "Trace pagefaults", parse_pagefaults, "maj"),
2832         OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2833         OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
2834         OPT_CALLBACK(0, "call-graph", &trace.opts,
2835                      "record_mode[,record_size]", record_callchain_help,
2836                      &record_parse_callchain_opt),
2837         OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
2838                     "Show the kernel callchains on the syscall exit path"),
2839         OPT_UINTEGER(0, "min-stack", &trace.min_stack,
2840                      "Set the minimum stack depth when parsing the callchain, "
2841                      "anything below the specified depth will be ignored."),
2842         OPT_UINTEGER(0, "max-stack", &trace.max_stack,
2843                      "Set the maximum stack depth when parsing the callchain, "
2844                      "anything beyond the specified depth will be ignored. "
2845                      "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
2846         OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
2847                         "per thread proc mmap processing timeout in ms"),
2848         OPT_END()
2849         };
2850         bool __maybe_unused max_stack_user_set = true;
2851         bool mmap_pages_user_set = true;
2852         const char * const trace_subcommands[] = { "record", NULL };
2853         int err;
2854         char bf[BUFSIZ];
2855
2856         signal(SIGSEGV, sighandler_dump_stack);
2857         signal(SIGFPE, sighandler_dump_stack);
2858
2859         trace.evlist = perf_evlist__new();
2860         trace.sctbl = syscalltbl__new();
2861
2862         if (trace.evlist == NULL || trace.sctbl == NULL) {
2863                 pr_err("Not enough memory to run!\n");
2864                 err = -ENOMEM;
2865                 goto out;
2866         }
2867
2868         argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
2869                                  trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
2870
2871         err = bpf__setup_stdout(trace.evlist);
2872         if (err) {
2873                 bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
2874                 pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
2875                 goto out;
2876         }
2877
2878         err = -1;
2879
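        /*
         * Page fault tracing wants the faulting address and timestamps so the
         * address can be shown and the events ordered in time.
         */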
2880         if (trace.trace_pgfaults) {
2881                 trace.opts.sample_address = true;
2882                 trace.opts.sample_time = true;
2883         }
2884
2885         if (trace.opts.mmap_pages == UINT_MAX)
2886                 mmap_pages_user_set = false;
2887
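        /*
         * UINT_MAX means --max-stack was not given: default to the compile
         * time maximum when replaying a file (-i), or to the current
         * kernel.perf_event_max_stack sysctl value when tracing live.
         */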
2888         if (trace.max_stack == UINT_MAX) {
2889                 trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
2890                 max_stack_user_set = false;
2891         }
2892
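        /*
         * If a stack depth was requested but no --call-graph mode was,
         * default to DWARF unwinding while tracing syscalls (only available
         * when perf was built with dwarf unwind support).
         */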
2893 #ifdef HAVE_DWARF_UNWIND_SUPPORT
2894         if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
2895                 record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
2896 #endif
2897
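        /*
         * Callchain samples are considerably larger; as root, and without an
         * explicit --mmap-pages, grow the ring buffer to four times the
         * perf_event_mlock_kb limit.
         */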
2898         if (callchain_param.enabled) {
2899                 if (!mmap_pages_user_set && geteuid() == 0)
2900                         trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
2901
2902                 symbol_conf.use_callchain = true;
2903         }
2904
2905         if (trace.evlist->nr_entries > 0)
2906                 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
2907
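        /*
         * 'perf trace record' is handled separately: trace__record() sets up
         * a recording session whose perf.data can later be analyzed with
         * 'perf trace -i'.
         */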
2908         if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
2909                 return trace__record(&trace, argc-1, &argv[1]);
2910
2911         /* -s (summary only) implies the summary; assign only when it is set so an explicit -S isn't overwritten */
2912         if (trace.summary_only)
2913                 trace.summary = trace.summary_only;
2914
2915         if (!trace.trace_syscalls && !trace.trace_pgfaults &&
2916             trace.evlist->nr_entries == 0 /* Was --event used? */) {
2917                 pr_err("Please specify something to trace.\n");
2918                 return -1;
2919         }
2920
2921         if (!trace.trace_syscalls && ev_qualifier_str) {
2922                 pr_err("The -e option can't be used with --no-syscalls.\n");
2923                 goto out;
2924         }
2925
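        /*
         * -o/--output: write to a file instead of the default stderr,
         * rotating a previous non-empty file of the same name to <file>.old.
         */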
2926         if (output_name != NULL) {
2927                 err = trace__open_output(&trace, output_name);
2928                 if (err < 0) {
2929                         perror("failed to create output file");
2930                         goto out;
2931                 }
2932         }
2933
2934         trace.open_id = syscalltbl__id(trace.sctbl, "open");
2935
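        /*
         * -e/--expr: a leading '!' inverts the list, i.e. trace every syscall
         * except the named ones.  Names may also come from the strace-like
         * group files installed under STRACE_GROUPS_DIR.
         */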
2936         if (ev_qualifier_str != NULL) {
2937                 const char *s = ev_qualifier_str;
2938                 struct strlist_config slist_config = {
2939                         .dirname = system_path(STRACE_GROUPS_DIR),
2940                 };
2941
2942                 trace.not_ev_qualifier = *s == '!';
2943                 if (trace.not_ev_qualifier)
2944                         ++s;
2945                 trace.ev_qualifier = strlist__new(s, &slist_config);
2946                 if (trace.ev_qualifier == NULL) {
2947                         fputs("Not enough memory to parse event qualifier\n",
2948                               trace.output);
2949                         err = -ENOMEM;
2950                         goto out_close;
2951                 }
2952
2953                 err = trace__validate_ev_qualifier(&trace);
2954                 if (err)
2955                         goto out_close;
2956         }
2957
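        /*
         * Check that the --pid/--tid/--cpu/--uid combination makes sense and
         * resolve the uid string to a numeric uid before deciding how to run.
         */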
2958         err = target__validate(&trace.opts.target);
2959         if (err) {
2960                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2961                 fprintf(trace.output, "%s", bf);
2962                 goto out_close;
2963         }
2964
2965         err = target__parse_uid(&trace.opts.target);
2966         if (err) {
2967                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2968                 fprintf(trace.output, "%s", bf);
2969                 goto out_close;
2970         }
2971
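        /* No workload and no target specified: trace the whole system. */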
2972         if (!argc && target__none(&trace.opts.target))
2973                 trace.opts.target.system_wide = true;
2974
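        /* -i replays a previously recorded session; otherwise trace live. */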
2975         if (input_name)
2976                 err = trace__replay(&trace);
2977         else
2978                 err = trace__run(&trace, argc, argv);
2979
2980 out_close:
2981         if (output_name != NULL)
2982                 fclose(trace.output);
2983 out:
2984         return err;
2985 }