Merge tag 'trace-v5.12-2' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt...
[linux-2.6-microblaze.git] / tools / tracing / latency / latency-collector.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2017, 2018, 2019, 2021 BMW Car IT GmbH
4  * Author: Viktor Rosendahl (viktor.rosendahl@bmw.de)
5  */
6
7 #define _GNU_SOURCE
8 #define _POSIX_C_SOURCE 200809L
9
10 #include <ctype.h>
11 #include <stdbool.h>
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <string.h>
15
16 #include <err.h>
17 #include <errno.h>
18 #include <fcntl.h>
19 #include <getopt.h>
20 #include <sched.h>
21 #include <linux/unistd.h>
22 #include <signal.h>
23 #include <sys/inotify.h>
24 #include <unistd.h>
25 #include <pthread.h>
26 #include <tracefs.h>
27
28 static const char *prg_name;
29 static const char *prg_unknown = "unknown program name";
30
31 static int fd_stdout;
32
33 static int sched_policy;
34 static bool sched_policy_set;
35
36 static int sched_pri;
37 static bool sched_pri_set;
38
39 static bool trace_enable = true;
40 static bool setup_ftrace = true;
41 static bool use_random_sleep;
42
43 #define TRACE_OPTS                              \
44         C(FUNC_TR, "function-trace"),           \
45         C(DISP_GR, "display-graph"),            \
46         C(NR,       NULL)
47
48 #undef C
49 #define C(a, b) OPTIDX_##a
50
51 enum traceopt {
52         TRACE_OPTS
53 };
54
55 #undef C
56 #define C(a, b)  b
57
58 static const char *const optstr[] = {
59         TRACE_OPTS
60 };
61
62 enum errhandling {
63         ERR_EXIT = 0,
64         ERR_WARN,
65         ERR_CLEANUP,
66 };
67
68 static bool use_options[OPTIDX_NR];
69
70 static char inotify_buffer[655360];
71
72 #define likely(x)      __builtin_expect(!!(x), 1)
73 #define unlikely(x)    __builtin_expect(!!(x), 0)
74 #define bool2str(x)    (x ? "true":"false")
75
76 #define DEFAULT_NR_PRINTER_THREADS (3)
77 static unsigned int nr_threads = DEFAULT_NR_PRINTER_THREADS;
78
79 #define DEFAULT_TABLE_SIZE (2)
80 static unsigned int table_startsize = DEFAULT_TABLE_SIZE;
81
82 static int verbosity;
83
84 #define verbose_sizechange() (verbosity >= 1)
85 #define verbose_lostevent()  (verbosity >= 2)
86 #define verbose_ftrace()     (verbosity >= 1)
87
88 #define was_changed(ORIG, CUR) (strcmp(ORIG, CUR) != 0)
89 #define needs_change(CUR, WANTED) (strcmp(CUR, WANTED) != 0)
90
91 static const char *debug_tracefile;
92 static const char *debug_tracefile_dflt;
93 static const char *debug_maxlat;
94 static const char *debug_maxlat_dflt;
95 static const char * const DEBUG_NOFILE = "[file not found]";
96
97 static const char * const TR_MAXLAT  = "tracing_max_latency";
98 static const char * const TR_THRESH  = "tracing_thresh";
99 static const char * const TR_CURRENT = "current_tracer";
100 static const char * const TR_OPTIONS = "trace_options";
101
102 static const char * const NOP_TRACER = "nop";
103
104 static const char * const OPT_NO_PREFIX = "no";
105
106 #define DFLT_THRESHOLD_US "0"
107 static const char *threshold = DFLT_THRESHOLD_US;
108
109 #define DEV_URANDOM     "/dev/urandom"
110 #define RT_DEFAULT_PRI (99)
111 #define DEFAULT_PRI    (0)
112
113 #define USEC_PER_MSEC (1000L)
114 #define NSEC_PER_USEC (1000L)
115 #define NSEC_PER_MSEC (USEC_PER_MSEC * NSEC_PER_USEC)
116
117 #define MSEC_PER_SEC (1000L)
118 #define USEC_PER_SEC (USEC_PER_MSEC * MSEC_PER_SEC)
119 #define NSEC_PER_SEC (NSEC_PER_MSEC * MSEC_PER_SEC)
120
121 #define SLEEP_TIME_MS_DEFAULT (1000L)
122 #define TRY_PRINTMUTEX_MS (1000)
123
124 static long sleep_time = (USEC_PER_MSEC * SLEEP_TIME_MS_DEFAULT);
125
126 static const char * const queue_full_warning =
127 "Could not queue trace for printing. It is likely that events happen faster\n"
128 "than what they can be printed. Probably partly because of random sleeping\n";
129
130 static const char * const no_tracer_msg =
131 "Could not find any tracers! Running this program as root may help!\n";
132
133 static const char * const no_latency_tr_msg =
134 "No latency tracers are supported by your kernel!\n";
135
136 struct policy {
137         const char *name;
138         int policy;
139         int default_pri;
140 };
141
142 static const struct policy policies[] = {
143         { "other", SCHED_OTHER, DEFAULT_PRI    },
144         { "batch", SCHED_BATCH, DEFAULT_PRI    },
145         { "idle",  SCHED_IDLE,  DEFAULT_PRI    },
146         { "rr",    SCHED_RR,    RT_DEFAULT_PRI },
147         { "fifo",  SCHED_FIFO,  RT_DEFAULT_PRI },
148         { NULL,    0,           DEFAULT_PRI    }
149 };
150
151 /*
152  * The default tracer will be the first on this list that is supported by the
153  * currently running Linux kernel.
154  */
155 static const char * const relevant_tracers[] = {
156         "preemptirqsoff",
157         "preemptoff",
158         "irqsoff",
159         "wakeup",
160         "wakeup_rt",
161         "wakeup_dl",
162         NULL
163 };
164
165 /* This is the list of tracers for which random sleep makes sense */
166 static const char * const random_tracers[] = {
167         "preemptirqsoff",
168         "preemptoff",
169         "irqsoff",
170         NULL
171 };
172
173 static const char *current_tracer;
174 static bool force_tracer;
175
176 struct ftrace_state {
177         char *tracer;
178         char *thresh;
179         bool opt[OPTIDX_NR];
180         bool opt_valid[OPTIDX_NR];
181         pthread_mutex_t mutex;
182 };
183
184 struct entry {
185         int ticket;
186         int ticket_completed_ref;
187 };
188
189 struct print_state {
190         int ticket_counter;
191         int ticket_completed;
192         pthread_mutex_t mutex;
193         pthread_cond_t cond;
194         int cnt;
195         pthread_mutex_t cnt_mutex;
196 };
197
198 struct short_msg {
199         char buf[160];
200         int len;
201 };
202
203 static struct print_state printstate;
204 static struct ftrace_state save_state;
205 volatile sig_atomic_t signal_flag;
206
207 #define PROB_TABLE_MAX_SIZE (1000)
208
209 int probabilities[PROB_TABLE_MAX_SIZE];
210
211 struct sleep_table {
212         int *table;
213         int size;
214         pthread_mutex_t mutex;
215 };
216
217 static struct sleep_table sleeptable;
218
219 #define QUEUE_SIZE (10)
220
221 struct queue {
222         struct entry entries[QUEUE_SIZE];
223         int next_prod_idx;
224         int next_cons_idx;
225         pthread_mutex_t mutex;
226         pthread_cond_t cond;
227 };
228
229 #define MAX_THREADS (40)
230
231 struct queue printqueue;
232 pthread_t printthread[MAX_THREADS];
233 pthread_mutex_t print_mtx;
234 #define PRINT_BUFFER_SIZE (16 * 1024 * 1024)
235
236 static void cleanup_exit(int status);
237 static int set_trace_opt(const char *opt, bool value);
238
239 static __always_inline void *malloc_or_die(size_t size)
240 {
241         void *ptr = malloc(size);
242
243         if (unlikely(ptr == NULL)) {
244                 warn("malloc() failed");
245                 cleanup_exit(EXIT_FAILURE);
246         }
247         return ptr;
248 }
249
250 static __always_inline void *malloc_or_die_nocleanup(size_t size)
251 {
252         void *ptr = malloc(size);
253
254         if (unlikely(ptr == NULL))
255                 err(0, "malloc() failed");
256         return ptr;
257 }
258
259 static __always_inline void write_or_die(int fd, const char *buf, size_t count)
260 {
261         ssize_t r;
262
263         do {
264                 r = write(fd, buf, count);
265                 if (unlikely(r < 0)) {
266                         if (errno == EINTR)
267                                 continue;
268                         warn("write() failed");
269                         cleanup_exit(EXIT_FAILURE);
270                 }
271                 count -= r;
272                 buf += r;
273         } while (count > 0);
274 }
275
276 static __always_inline void clock_gettime_or_die(clockid_t clk_id,
277                                                  struct timespec *tp)
278 {
279         int r = clock_gettime(clk_id, tp);
280
281         if (unlikely(r != 0))
282                 err(EXIT_FAILURE, "clock_gettime() failed");
283 }
284
285 static __always_inline void sigemptyset_or_die(sigset_t *s)
286 {
287         if (unlikely(sigemptyset(s) != 0)) {
288                 warn("sigemptyset() failed");
289                 cleanup_exit(EXIT_FAILURE);
290         }
291 }
292
293 static __always_inline void sigaddset_or_die(sigset_t *s, int signum)
294 {
295         if (unlikely(sigaddset(s, signum) != 0)) {
296                 warn("sigemptyset() failed");
297                 cleanup_exit(EXIT_FAILURE);
298         }
299 }
300
301 static __always_inline void sigaction_or_die(int signum,
302                                              const struct sigaction *act,
303                                              struct sigaction *oldact)
304 {
305         if (unlikely(sigaction(signum, act, oldact) != 0)) {
306                 warn("sigaction() failed");
307                 cleanup_exit(EXIT_FAILURE);
308         }
309 }
310
311 static void open_stdout(void)
312 {
313         if (setvbuf(stdout, NULL, _IONBF, 0) != 0)
314                 err(EXIT_FAILURE, "setvbuf() failed");
315         fd_stdout = fileno(stdout);
316         if (fd_stdout < 0)
317                 err(EXIT_FAILURE, "fileno() failed");
318 }
319
320 /*
321  * It's not worth it to call cleanup_exit() from mutex functions because
322  * cleanup_exit() uses mutexes.
323  */
324 static __always_inline void mutex_lock(pthread_mutex_t *mtx)
325 {
326         errno = pthread_mutex_lock(mtx);
327         if (unlikely(errno))
328                 err(EXIT_FAILURE, "pthread_mutex_lock() failed");
329 }
330
331
332 static __always_inline void mutex_unlock(pthread_mutex_t *mtx)
333 {
334         errno = pthread_mutex_unlock(mtx);
335         if (unlikely(errno))
336                 err(EXIT_FAILURE, "pthread_mutex_unlock() failed");
337 }
338
339 static __always_inline void cond_signal(pthread_cond_t *cond)
340 {
341         errno = pthread_cond_signal(cond);
342         if (unlikely(errno))
343                 err(EXIT_FAILURE, "pthread_cond_signal() failed");
344 }
345
346 static __always_inline void cond_wait(pthread_cond_t *restrict cond,
347                                       pthread_mutex_t *restrict mutex)
348 {
349         errno = pthread_cond_wait(cond, mutex);
350         if (unlikely(errno))
351                 err(EXIT_FAILURE, "pthread_cond_wait() failed");
352 }
353
354 static __always_inline void cond_broadcast(pthread_cond_t *cond)
355 {
356         errno = pthread_cond_broadcast(cond);
357         if (unlikely(errno))
358                 err(EXIT_FAILURE, "pthread_cond_broadcast() failed");
359 }
360
361 static __always_inline void
362 mutex_init(pthread_mutex_t *mutex,
363            const pthread_mutexattr_t *attr)
364 {
365         errno = pthread_mutex_init(mutex, attr);
366         if (errno)
367                 err(EXIT_FAILURE, "pthread_mutex_init() failed");
368 }
369
370 static __always_inline void mutexattr_init(pthread_mutexattr_t *attr)
371 {
372         errno = pthread_mutexattr_init(attr);
373         if (errno)
374                 err(EXIT_FAILURE, "pthread_mutexattr_init() failed");
375 }
376
377 static __always_inline void mutexattr_destroy(pthread_mutexattr_t *attr)
378 {
379         errno = pthread_mutexattr_destroy(attr);
380         if (errno)
381                 err(EXIT_FAILURE, "pthread_mutexattr_destroy() failed");
382 }
383
384 static __always_inline void mutexattr_settype(pthread_mutexattr_t *attr,
385                                               int type)
386 {
387         errno = pthread_mutexattr_settype(attr, type);
388         if (errno)
389                 err(EXIT_FAILURE, "pthread_mutexattr_settype() failed");
390 }
391
392 static __always_inline void condattr_init(pthread_condattr_t *attr)
393 {
394         errno = pthread_condattr_init(attr);
395         if (errno)
396                 err(EXIT_FAILURE, "pthread_condattr_init() failed");
397 }
398
399 static __always_inline void condattr_destroy(pthread_condattr_t *attr)
400 {
401         errno = pthread_condattr_destroy(attr);
402         if (errno)
403                 err(EXIT_FAILURE, "pthread_condattr_destroy() failed");
404 }
405
406 static __always_inline void condattr_setclock(pthread_condattr_t *attr,
407                                               clockid_t clock_id)
408 {
409         errno = pthread_condattr_setclock(attr, clock_id);
410         if (unlikely(errno))
411                 err(EXIT_FAILURE, "pthread_condattr_setclock() failed");
412 }
413
414 static __always_inline void cond_init(pthread_cond_t *cond,
415                                       const pthread_condattr_t *attr)
416 {
417         errno = pthread_cond_init(cond, attr);
418         if (errno)
419                 err(EXIT_FAILURE, "pthread_cond_init() failed");
420 }
421
422 static __always_inline int
423 cond_timedwait(pthread_cond_t *restrict cond,
424                pthread_mutex_t *restrict mutex,
425                const struct timespec *restrict abstime)
426 {
427         errno = pthread_cond_timedwait(cond, mutex, abstime);
428         if (errno && errno != ETIMEDOUT)
429                 err(EXIT_FAILURE, "pthread_cond_timedwait() failed");
430         return errno;
431 }
432
433 static void init_printstate(void)
434 {
435         pthread_condattr_t cattr;
436
437         printstate.ticket_counter = 0;
438         printstate.ticket_completed = 0;
439         printstate.cnt = 0;
440
441         mutex_init(&printstate.mutex, NULL);
442
443         condattr_init(&cattr);
444         condattr_setclock(&cattr, CLOCK_MONOTONIC);
445         cond_init(&printstate.cond, &cattr);
446         condattr_destroy(&cattr);
447 }
448
449 static void init_print_mtx(void)
450 {
451         pthread_mutexattr_t mattr;
452
453         mutexattr_init(&mattr);
454         mutexattr_settype(&mattr, PTHREAD_MUTEX_RECURSIVE);
455         mutex_init(&print_mtx, &mattr);
456         mutexattr_destroy(&mattr);
457
458 }
459
460 static void signal_blocking(int how)
461 {
462         sigset_t s;
463
464         sigemptyset_or_die(&s);
465         sigaddset_or_die(&s, SIGHUP);
466         sigaddset_or_die(&s, SIGTERM);
467         sigaddset_or_die(&s, SIGINT);
468
469         errno = pthread_sigmask(how, &s, NULL);
470         if (unlikely(errno)) {
471                 warn("pthread_sigmask() failed");
472                 cleanup_exit(EXIT_FAILURE);
473         }
474 }
475
476 static void signal_handler(int num)
477 {
478         signal_flag = num;
479 }
480
481 static void setup_sig_handler(void)
482 {
483         struct sigaction sa;
484
485         memset(&sa, 0, sizeof(sa));
486         sa.sa_handler = signal_handler;
487
488         sigaction_or_die(SIGHUP, &sa, NULL);
489         sigaction_or_die(SIGTERM, &sa, NULL);
490         sigaction_or_die(SIGINT, &sa, NULL);
491 }
492
493 static void process_signal(int signal)
494 {
495         char *name;
496
497         name = strsignal(signal);
498         if (name == NULL)
499                 printf("Received signal %d\n", signal);
500         else
501                 printf("Received signal %d (%s)\n", signal, name);
502         cleanup_exit(EXIT_SUCCESS);
503 }
504
505 static __always_inline void check_signals(void)
506 {
507         int signal = signal_flag;
508
509         if (unlikely(signal))
510                 process_signal(signal);
511 }
512
513 static __always_inline void get_time_in_future(struct timespec *future,
514                                                long time_us)
515 {
516         long nsec;
517
518         clock_gettime_or_die(CLOCK_MONOTONIC, future);
519         future->tv_sec += time_us / USEC_PER_SEC;
520         nsec = future->tv_nsec + (time_us * NSEC_PER_USEC) % NSEC_PER_SEC;
521         if (nsec >= NSEC_PER_SEC) {
522                 future->tv_nsec = nsec % NSEC_PER_SEC;
523                 future->tv_sec += 1;
524         }
525 }
526
527 static __always_inline bool time_has_passed(const struct timespec *time)
528 {
529         struct timespec now;
530
531         clock_gettime_or_die(CLOCK_MONOTONIC, &now);
532         if (now.tv_sec > time->tv_sec)
533                 return true;
534         if (now.tv_sec < time->tv_sec)
535                 return false;
536         return (now.tv_nsec >= time->tv_nsec);
537 }
538
539 static bool mutex_trylock_limit(pthread_mutex_t *mutex, int time_ms)
540 {
541         long time_us = time_ms * USEC_PER_MSEC;
542         struct timespec limit;
543
544         get_time_in_future(&limit, time_us);
545         do {
546                 errno =  pthread_mutex_trylock(mutex);
547                 if (errno && errno != EBUSY)
548                         err(EXIT_FAILURE, "pthread_mutex_trylock() failed");
549         } while (errno && !time_has_passed(&limit));
550         return errno == 0;
551 }
552
553 static void restore_trace_opts(const struct ftrace_state *state,
554                                 const bool *cur)
555 {
556         int i;
557         int r;
558
559         for (i = 0; i < OPTIDX_NR; i++)
560                 if (state->opt_valid[i] && state->opt[i] != cur[i]) {
561                         r = set_trace_opt(optstr[i], state->opt[i]);
562                         if (r < 0)
563                                 warnx("Failed to restore the %s option to %s",
564                                       optstr[i], bool2str(state->opt[i]));
565                         else if (verbose_ftrace())
566                                 printf("Restored the %s option in %s to %s\n",
567                                        optstr[i], TR_OPTIONS,
568                                        bool2str(state->opt[i]));
569                 }
570 }
571
572 static char *read_file(const char *file, enum errhandling h)
573 {
574         int psize;
575         char *r;
576         static const char *emsg = "Failed to read the %s file";
577
578         r = tracefs_instance_file_read(NULL, file, &psize);
579         if (!r) {
580                 if (h) {
581                         warn(emsg, file);
582                         if (h == ERR_CLEANUP)
583                                 cleanup_exit(EXIT_FAILURE);
584                 } else
585                         errx(EXIT_FAILURE, emsg, file);
586         }
587
588         if (r && r[psize - 1] == '\n')
589                 r[psize - 1] = '\0';
590         return r;
591 }
592
593 static void restore_file(const char *file, char **saved, const char *cur)
594 {
595         if (*saved && was_changed(*saved, cur)) {
596                 if (tracefs_instance_file_write(NULL, file, *saved) < 0)
597                         warnx("Failed to restore %s to %s!", file, *saved);
598                 else if (verbose_ftrace())
599                         printf("Restored %s to %s\n", file, *saved);
600                 free(*saved);
601                 *saved = NULL;
602         }
603 }
604
605 static void restore_ftrace(void)
606 {
607         mutex_lock(&save_state.mutex);
608
609         restore_file(TR_CURRENT, &save_state.tracer, current_tracer);
610         restore_file(TR_THRESH, &save_state.thresh, threshold);
611         restore_trace_opts(&save_state, use_options);
612
613         mutex_unlock(&save_state.mutex);
614 }
615
616 static void cleanup_exit(int status)
617 {
618         char *maxlat;
619
620         if (!setup_ftrace)
621                 exit(status);
622
623         /*
624          * We try the print_mtx for 1 sec in order to avoid garbled
625          * output if possible, but if it cannot be obtained we proceed anyway.
626          */
627         mutex_trylock_limit(&print_mtx, TRY_PRINTMUTEX_MS);
628
629         maxlat = read_file(TR_MAXLAT, ERR_WARN);
630         if (maxlat) {
631                 printf("The maximum detected latency was: %sus\n", maxlat);
632                 free(maxlat);
633         }
634
635         restore_ftrace();
636         /*
637          * We do not need to unlock the print_mtx here because we will exit at
638          * the end of this function. Unlocking print_mtx causes problems if a
639          * print thread happens to be waiting for the mutex because we have
640          * just changed the ftrace settings to the original and thus the
641          * print thread would output incorrect data from ftrace.
642          */
643         exit(status);
644 }
645
646 static void init_save_state(void)
647 {
648         pthread_mutexattr_t mattr;
649
650         mutexattr_init(&mattr);
651         mutexattr_settype(&mattr, PTHREAD_MUTEX_RECURSIVE);
652         mutex_init(&save_state.mutex, &mattr);
653         mutexattr_destroy(&mattr);
654
655         save_state.tracer = NULL;
656         save_state.thresh = NULL;
657         save_state.opt_valid[OPTIDX_FUNC_TR] = false;
658         save_state.opt_valid[OPTIDX_DISP_GR] = false;
659 }
660
661 static int printstate_next_ticket(struct entry *req)
662 {
663         int r;
664
665         r = ++(printstate.ticket_counter);
666         req->ticket = r;
667         req->ticket_completed_ref = printstate.ticket_completed;
668         cond_broadcast(&printstate.cond);
669         return r;
670 }
671
672 static __always_inline
673 void printstate_mark_req_completed(const struct entry *req)
674 {
675         if (req->ticket > printstate.ticket_completed)
676                 printstate.ticket_completed = req->ticket;
677 }
678
679 static __always_inline
680 bool printstate_has_new_req_arrived(const struct entry *req)
681 {
682         return (printstate.ticket_counter != req->ticket);
683 }
684
685 static __always_inline int printstate_cnt_inc(void)
686 {
687         int value;
688
689         mutex_lock(&printstate.cnt_mutex);
690         value = ++printstate.cnt;
691         mutex_unlock(&printstate.cnt_mutex);
692         return value;
693 }
694
695 static __always_inline int printstate_cnt_dec(void)
696 {
697         int value;
698
699         mutex_lock(&printstate.cnt_mutex);
700         value = --printstate.cnt;
701         mutex_unlock(&printstate.cnt_mutex);
702         return value;
703 }
704
705 static __always_inline int printstate_cnt_read(void)
706 {
707         int value;
708
709         mutex_lock(&printstate.cnt_mutex);
710         value = printstate.cnt;
711         mutex_unlock(&printstate.cnt_mutex);
712         return value;
713 }
714
715 static __always_inline
716 bool prev_req_won_race(const struct entry *req)
717 {
718         return (printstate.ticket_completed != req->ticket_completed_ref);
719 }
720
721 static void sleeptable_resize(int size, bool printout, struct short_msg *msg)
722 {
723         int bytes;
724
725         if (printout) {
726                 msg->len = 0;
727                 if (unlikely(size > PROB_TABLE_MAX_SIZE))
728                         bytes = snprintf(msg->buf, sizeof(msg->buf),
729 "Cannot increase probability table to %d (maximum size reached)\n", size);
730                 else
731                         bytes = snprintf(msg->buf, sizeof(msg->buf),
732 "Increasing probability table to %d\n", size);
733                 if (bytes < 0)
734                         warn("snprintf() failed");
735                 else
736                         msg->len = bytes;
737         }
738
739         if (unlikely(size < 0)) {
740                 /* Should never happen */
741                 warnx("Bad program state at %s:%d", __FILE__, __LINE__);
742                 cleanup_exit(EXIT_FAILURE);
743                 return;
744         }
745         sleeptable.size = size;
746         sleeptable.table = &probabilities[PROB_TABLE_MAX_SIZE - size];
747 }
748
749 static void init_probabilities(void)
750 {
751         int i;
752         int j = 1000;
753
754         for (i = 0; i < PROB_TABLE_MAX_SIZE; i++) {
755                 probabilities[i] = 1000 / j;
756                 j--;
757         }
758         mutex_init(&sleeptable.mutex, NULL);
759 }
760
761 static int table_get_probability(const struct entry *req,
762                                  struct short_msg *msg)
763 {
764         int diff = req->ticket - req->ticket_completed_ref;
765         int rval = 0;
766
767         msg->len = 0;
768         diff--;
769         /* Should never happen...*/
770         if (unlikely(diff < 0)) {
771                 warnx("Programmer assumption error at %s:%d\n", __FILE__,
772                       __LINE__);
773                 cleanup_exit(EXIT_FAILURE);
774         }
775         mutex_lock(&sleeptable.mutex);
776         if (diff >= (sleeptable.size - 1)) {
777                 rval = sleeptable.table[sleeptable.size - 1];
778                 sleeptable_resize(sleeptable.size + 1, verbose_sizechange(),
779                                   msg);
780         } else {
781                 rval = sleeptable.table[diff];
782         }
783         mutex_unlock(&sleeptable.mutex);
784         return rval;
785 }
786
787 static void init_queue(struct queue *q)
788 {
789         q->next_prod_idx = 0;
790         q->next_cons_idx = 0;
791         mutex_init(&q->mutex, NULL);
792         errno = pthread_cond_init(&q->cond, NULL);
793         if (errno)
794                 err(EXIT_FAILURE, "pthread_cond_init() failed");
795 }
796
797 static __always_inline int queue_len(const struct queue *q)
798 {
799         if (q->next_prod_idx >= q->next_cons_idx)
800                 return q->next_prod_idx - q->next_cons_idx;
801         else
802                 return QUEUE_SIZE - q->next_cons_idx + q->next_prod_idx;
803 }
804
805 static __always_inline int queue_nr_free(const struct queue *q)
806 {
807         int nr_free = QUEUE_SIZE - queue_len(q);
808
809         /*
810          * If there is only one slot left we will anyway lie and claim that the
811          * queue is full because adding an element will make it appear empty
812          */
813         if (nr_free == 1)
814                 nr_free = 0;
815         return nr_free;
816 }
817
818 static __always_inline void queue_idx_inc(int *idx)
819 {
820         *idx = (*idx + 1) % QUEUE_SIZE;
821 }
822
823 static __always_inline void queue_push_to_back(struct queue *q,
824                                               const struct entry *e)
825 {
826         q->entries[q->next_prod_idx] = *e;
827         queue_idx_inc(&q->next_prod_idx);
828 }
829
830 static __always_inline struct entry queue_pop_from_front(struct queue *q)
831 {
832         struct entry e = q->entries[q->next_cons_idx];
833
834         queue_idx_inc(&q->next_cons_idx);
835         return e;
836 }
837
838 static __always_inline void queue_cond_signal(struct queue *q)
839 {
840         cond_signal(&q->cond);
841 }
842
843 static __always_inline void queue_cond_wait(struct queue *q)
844 {
845         cond_wait(&q->cond, &q->mutex);
846 }
847
848 static __always_inline int queue_try_to_add_entry(struct queue *q,
849                                                   const struct entry *e)
850 {
851         int r = 0;
852
853         mutex_lock(&q->mutex);
854         if (queue_nr_free(q) > 0) {
855                 queue_push_to_back(q, e);
856                 cond_signal(&q->cond);
857         } else
858                 r = -1;
859         mutex_unlock(&q->mutex);
860         return r;
861 }
862
863 static struct entry queue_wait_for_entry(struct queue *q)
864 {
865         struct entry e;
866
867         mutex_lock(&q->mutex);
868         while (true) {
869                 if (queue_len(&printqueue) > 0) {
870                         e = queue_pop_from_front(q);
871                         break;
872                 }
873                 queue_cond_wait(q);
874         }
875         mutex_unlock(&q->mutex);
876
877         return e;
878 }
879
880 static const struct policy *policy_from_name(const char *name)
881 {
882         const struct policy *p = &policies[0];
883
884         while (p->name != NULL) {
885                 if (!strcmp(name, p->name))
886                         return p;
887                 p++;
888         }
889         return NULL;
890 }
891
892 static const char *policy_name(int policy)
893 {
894         const struct policy *p = &policies[0];
895         static const char *rval = "unknown";
896
897         while (p->name != NULL) {
898                 if (p->policy == policy)
899                         return p->name;
900                 p++;
901         }
902         return rval;
903 }
904
905 static bool is_relevant_tracer(const char *name)
906 {
907         unsigned int i;
908
909         for (i = 0; relevant_tracers[i]; i++)
910                 if (!strcmp(name, relevant_tracers[i]))
911                         return true;
912         return false;
913 }
914
915 static bool random_makes_sense(const char *name)
916 {
917         unsigned int i;
918
919         for (i = 0; random_tracers[i]; i++)
920                 if (!strcmp(name, random_tracers[i]))
921                         return true;
922         return false;
923 }
924
925 static void show_available(void)
926 {
927         char **tracers;
928         int found = 0;
929         int i;
930
931         tracers = tracefs_tracers(NULL);
932         for (i = 0; tracers && tracers[i]; i++) {
933                 if (is_relevant_tracer(tracers[i]))
934                         found++;
935         }
936
937         if (!tracers) {
938                 warnx(no_tracer_msg);
939                 return;
940         }
941
942         if (!found) {
943                 warnx(no_latency_tr_msg);
944                 tracefs_list_free(tracers);
945                 return;
946         }
947
948         printf("The following latency tracers are available on your system:\n");
949         for (i = 0; tracers[i]; i++) {
950                 if (is_relevant_tracer(tracers[i]))
951                         printf("%s\n", tracers[i]);
952         }
953         tracefs_list_free(tracers);
954 }
955
956 static bool tracer_valid(const char *name, bool *notracer)
957 {
958         char **tracers;
959         int i;
960         bool rval = false;
961
962         *notracer = false;
963         tracers = tracefs_tracers(NULL);
964         if (!tracers) {
965                 *notracer = true;
966                 return false;
967         }
968         for (i = 0; tracers[i]; i++)
969                 if (!strcmp(tracers[i], name)) {
970                         rval = true;
971                         break;
972                 }
973         tracefs_list_free(tracers);
974         return rval;
975 }
976
977 static const char *find_default_tracer(void)
978 {
979         int i;
980         bool notracer;
981         bool valid;
982
983         for (i = 0; relevant_tracers[i]; i++) {
984                 valid = tracer_valid(relevant_tracers[i], &notracer);
985                 if (notracer)
986                         errx(EXIT_FAILURE, no_tracer_msg);
987                 if (valid)
988                         return relevant_tracers[i];
989         }
990         return NULL;
991 }
992
993 static bool toss_coin(struct drand48_data *buffer, unsigned int prob)
994 {
995         long r;
996
997         if (unlikely(lrand48_r(buffer, &r))) {
998                 warnx("lrand48_r() failed");
999                 cleanup_exit(EXIT_FAILURE);
1000         }
1001         r = r % 1000L;
1002         if (r < prob)
1003                 return true;
1004         else
1005                 return false;
1006 }
1007
1008
1009 static long go_to_sleep(const struct entry *req)
1010 {
1011         struct timespec future;
1012         long delay = sleep_time;
1013
1014         get_time_in_future(&future, delay);
1015
1016         mutex_lock(&printstate.mutex);
1017         while (!printstate_has_new_req_arrived(req)) {
1018                 cond_timedwait(&printstate.cond, &printstate.mutex, &future);
1019                 if (time_has_passed(&future))
1020                         break;
1021         };
1022
1023         if (printstate_has_new_req_arrived(req))
1024                 delay = -1;
1025         mutex_unlock(&printstate.mutex);
1026
1027         return delay;
1028 }
1029
1030
1031 static void set_priority(void)
1032 {
1033         int r;
1034         pid_t pid;
1035         struct sched_param param;
1036
1037         memset(&param, 0, sizeof(param));
1038         param.sched_priority = sched_pri;
1039
1040         pid = getpid();
1041         r = sched_setscheduler(pid, sched_policy, &param);
1042
1043         if (r != 0)
1044                 err(EXIT_FAILURE, "sched_setscheduler() failed");
1045 }
1046
1047 pid_t latency_collector_gettid(void)
1048 {
1049         return (pid_t) syscall(__NR_gettid);
1050 }
1051
1052 static void print_priority(void)
1053 {
1054         pid_t tid;
1055         int policy;
1056         int r;
1057         struct sched_param param;
1058
1059         tid = latency_collector_gettid();
1060         r = pthread_getschedparam(pthread_self(), &policy, &param);
1061         if (r != 0) {
1062                 warn("pthread_getschedparam() failed");
1063                 cleanup_exit(EXIT_FAILURE);
1064         }
1065         mutex_lock(&print_mtx);
1066         printf("Thread %d runs with scheduling policy %s and priority %d\n",
1067                tid, policy_name(policy), param.sched_priority);
1068         mutex_unlock(&print_mtx);
1069 }
1070
1071 static __always_inline
1072 void __print_skipmessage(const struct short_msg *resize_msg,
1073                          const struct timespec *timestamp, char *buffer,
1074                          size_t bufspace, const struct entry *req, bool excuse,
1075                          const char *str)
1076 {
1077         ssize_t bytes = 0;
1078         char *p = &buffer[0];
1079         long us, sec;
1080         int r;
1081
1082         sec = timestamp->tv_sec;
1083         us = timestamp->tv_nsec / 1000;
1084
1085         if (resize_msg != NULL && resize_msg->len > 0) {
1086                 strncpy(p, resize_msg->buf, resize_msg->len);
1087                 bytes += resize_msg->len;
1088                 p += resize_msg->len;
1089                 bufspace -= resize_msg->len;
1090         }
1091
1092         if (excuse)
1093                 r = snprintf(p, bufspace,
1094 "%ld.%06ld Latency %d printout skipped due to %s\n",
1095                              sec, us, req->ticket, str);
1096         else
1097                 r = snprintf(p, bufspace, "%ld.%06ld Latency %d detected\n",
1098                             sec, us, req->ticket);
1099
1100         if (r < 0)
1101                 warn("snprintf() failed");
1102         else
1103                 bytes += r;
1104
1105         /* These prints could happen concurrently */
1106         mutex_lock(&print_mtx);
1107         write_or_die(fd_stdout, buffer, bytes);
1108         mutex_unlock(&print_mtx);
1109 }
1110
1111 static void print_skipmessage(const struct short_msg *resize_msg,
1112                               const struct timespec *timestamp, char *buffer,
1113                               size_t bufspace, const struct entry *req,
1114                               bool excuse)
1115 {
1116         __print_skipmessage(resize_msg, timestamp, buffer, bufspace, req,
1117                             excuse, "random delay");
1118 }
1119
1120 static void print_lostmessage(const struct timespec *timestamp, char *buffer,
1121                               size_t bufspace, const struct entry *req,
1122                               const char *reason)
1123 {
1124         __print_skipmessage(NULL, timestamp, buffer, bufspace, req, true,
1125                             reason);
1126 }
1127
1128 static void print_tracefile(const struct short_msg *resize_msg,
1129                             const struct timespec *timestamp, char *buffer,
1130                             size_t bufspace, long slept,
1131                             const struct entry *req)
1132 {
1133         static const int reserve = 256;
1134         char *p = &buffer[0];
1135         ssize_t bytes = 0;
1136         ssize_t bytes_tot = 0;
1137         long us, sec;
1138         long slept_ms;
1139         int trace_fd;
1140
1141         /* Save some space for the final string and final null char */
1142         bufspace = bufspace - reserve - 1;
1143
1144         if (resize_msg != NULL && resize_msg->len > 0) {
1145                 bytes = resize_msg->len;
1146                 strncpy(p, resize_msg->buf, bytes);
1147                 bytes_tot += bytes;
1148                 p += bytes;
1149                 bufspace -= bytes;
1150         }
1151
1152         trace_fd = open(debug_tracefile, O_RDONLY);
1153
1154         if (trace_fd < 0) {
1155                 warn("open() failed on %s", debug_tracefile);
1156                 return;
1157         }
1158
1159         sec = timestamp->tv_sec;
1160         us = timestamp->tv_nsec / 1000;
1161
1162         if (slept != 0) {
1163                 slept_ms = slept / 1000;
1164                 bytes = snprintf(p, bufspace,
1165 "%ld.%06ld Latency %d randomly sleep for %ld ms before print\n",
1166                                  sec, us, req->ticket, slept_ms);
1167         } else {
1168                 bytes = snprintf(p, bufspace,
1169                                  "%ld.%06ld Latency %d immediate print\n", sec,
1170                                  us, req->ticket);
1171         }
1172
1173         if (bytes < 0) {
1174                 warn("snprintf() failed");
1175                 return;
1176         }
1177         p += bytes;
1178         bufspace -= bytes;
1179         bytes_tot += bytes;
1180
1181         bytes = snprintf(p, bufspace,
1182 ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> BEGIN <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n"
1183                 );
1184
1185         if (bytes < 0) {
1186                 warn("snprintf() failed");
1187                 return;
1188         }
1189
1190         p += bytes;
1191         bufspace -= bytes;
1192         bytes_tot += bytes;
1193
1194         do {
1195                 bytes = read(trace_fd, p, bufspace);
1196                 if (bytes < 0) {
1197                         if (errno == EINTR)
1198                                 continue;
1199                         warn("read() failed on %s", debug_tracefile);
1200                         if (unlikely(close(trace_fd) != 0))
1201                                 warn("close() failed on %s", debug_tracefile);
1202                         return;
1203                 }
1204                 if (bytes == 0)
1205                         break;
1206                 p += bytes;
1207                 bufspace -= bytes;
1208                 bytes_tot += bytes;
1209         } while (true);
1210
1211         if (unlikely(close(trace_fd) != 0))
1212                 warn("close() failed on %s", debug_tracefile);
1213
1214         printstate_cnt_dec();
1215         /* Add the reserve space back to the budget for the final string */
1216         bufspace += reserve;
1217
1218         bytes = snprintf(p, bufspace,
1219                          ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> END <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n\n");
1220
1221         if (bytes < 0) {
1222                 warn("snprintf() failed");
1223                 return;
1224         }
1225
1226         bytes_tot += bytes;
1227
1228         /* These prints could happen concurrently */
1229         mutex_lock(&print_mtx);
1230         write_or_die(fd_stdout, buffer, bytes_tot);
1231         mutex_unlock(&print_mtx);
1232 }
1233
1234 static char *get_no_opt(const char *opt)
1235 {
1236         char *no_opt;
1237         int s;
1238
1239         s = strlen(opt) + strlen(OPT_NO_PREFIX) + 1;
1240         /* We may be called from cleanup_exit() via set_trace_opt() */
1241         no_opt = malloc_or_die_nocleanup(s);
1242         strcpy(no_opt, OPT_NO_PREFIX);
1243         strcat(no_opt, opt);
1244         return no_opt;
1245 }
1246
1247 static char *find_next_optstr(const char *allopt, const char **next)
1248 {
1249         const char *begin;
1250         const char *end;
1251         char *r;
1252         int s = 0;
1253
1254         if (allopt == NULL)
1255                 return NULL;
1256
1257         for (begin = allopt; *begin != '\0'; begin++) {
1258                 if (isgraph(*begin))
1259                         break;
1260         }
1261
1262         if (*begin == '\0')
1263                 return NULL;
1264
1265         for (end = begin; *end != '\0' && isgraph(*end); end++)
1266                 s++;
1267
1268         r = malloc_or_die_nocleanup(s + 1);
1269         strncpy(r, begin, s);
1270         r[s] = '\0';
1271         *next = begin + s;
1272         return r;
1273 }
1274
1275 static bool get_trace_opt(const char *allopt, const char *opt, bool *found)
1276 {
1277         *found = false;
1278         char *no_opt;
1279         char *str;
1280         const char *next = allopt;
1281         bool rval = false;
1282
1283         no_opt = get_no_opt(opt);
1284
1285         do {
1286                 str = find_next_optstr(next, &next);
1287                 if (str == NULL)
1288                         break;
1289                 if (!strcmp(str, opt)) {
1290                         *found = true;
1291                         rval = true;
1292                         free(str);
1293                         break;
1294                 }
1295                 if (!strcmp(str, no_opt)) {
1296                         *found = true;
1297                         rval = false;
1298                         free(str);
1299                         break;
1300                 }
1301                 free(str);
1302         } while (true);
1303         free(no_opt);
1304
1305         return rval;
1306 }
1307
1308 static int set_trace_opt(const char *opt, bool value)
1309 {
1310         char *str;
1311         int r;
1312
1313         if (value)
1314                 str = strdup(opt);
1315         else
1316                 str = get_no_opt(opt);
1317
1318         r = tracefs_instance_file_write(NULL, TR_OPTIONS, str);
1319         free(str);
1320         return r;
1321 }
1322
1323 void save_trace_opts(struct ftrace_state *state)
1324 {
1325         char *allopt;
1326         int psize;
1327         int i;
1328
1329         allopt = tracefs_instance_file_read(NULL, TR_OPTIONS, &psize);
1330         if (!allopt)
1331                 errx(EXIT_FAILURE, "Failed to read the %s file\n", TR_OPTIONS);
1332
1333         for (i = 0; i < OPTIDX_NR; i++)
1334                 state->opt[i] = get_trace_opt(allopt, optstr[i],
1335                                               &state->opt_valid[i]);
1336
1337         free(allopt);
1338 }
1339
1340 static void write_file(const char *file, const char *cur, const char *new,
1341                        enum errhandling h)
1342 {
1343         int r;
1344         static const char *emsg = "Failed to write to the %s file!";
1345
1346         /* Do nothing if we now that the current and new value are equal */
1347         if (cur && !needs_change(cur, new))
1348                 return;
1349
1350         r = tracefs_instance_file_write(NULL, file, new);
1351         if (r < 0) {
1352                 if (h) {
1353                         warnx(emsg, file);
1354                         if (h == ERR_CLEANUP)
1355                                 cleanup_exit(EXIT_FAILURE);
1356                 } else
1357                         errx(EXIT_FAILURE, emsg, file);
1358         }
1359         if (verbose_ftrace()) {
1360                 mutex_lock(&print_mtx);
1361                 printf("%s was set to %s\n", file, new);
1362                 mutex_unlock(&print_mtx);
1363         }
1364 }
1365
1366 static void reset_max_latency(void)
1367 {
1368         write_file(TR_MAXLAT, NULL, "0", ERR_CLEANUP);
1369 }
1370
1371 static void save_and_disable_tracer(void)
1372 {
1373         char *orig_th;
1374         char *tracer;
1375         bool need_nop = false;
1376
1377         mutex_lock(&save_state.mutex);
1378
1379         save_trace_opts(&save_state);
1380         tracer = read_file(TR_CURRENT, ERR_EXIT);
1381         orig_th = read_file(TR_THRESH, ERR_EXIT);
1382
1383         if (needs_change(tracer, NOP_TRACER)) {
1384                 mutex_lock(&print_mtx);
1385                 if (force_tracer) {
1386                         printf(
1387                                 "The %s tracer is already in use but proceeding anyway!\n",
1388                                 tracer);
1389                 } else {
1390                         printf(
1391                                 "The %s tracer is already in use, cowardly bailing out!\n"
1392                                 "This could indicate that another program or instance is tracing.\n"
1393                                 "Use the -F [--force] option to disregard the current tracer.\n", tracer);
1394                         exit(0);
1395                 }
1396                 mutex_unlock(&print_mtx);
1397                 need_nop = true;
1398         }
1399
1400         save_state.tracer =  tracer;
1401         save_state.thresh = orig_th;
1402
1403         if (need_nop)
1404                 write_file(TR_CURRENT, NULL, NOP_TRACER, ERR_EXIT);
1405
1406         mutex_unlock(&save_state.mutex);
1407 }
1408
1409 void set_trace_opts(struct ftrace_state *state, bool *new)
1410 {
1411         int i;
1412         int r;
1413
1414         /*
1415          * We only set options if we earlier detected that the option exists in
1416          * the trace_options file and that the wanted setting is different from
1417          * the one we saw in save_and_disable_tracer()
1418          */
1419         for (i = 0; i < OPTIDX_NR; i++)
1420                 if (state->opt_valid[i] &&
1421                     state->opt[i] != new[i]) {
1422                         r = set_trace_opt(optstr[i], new[i]);
1423                         if (r < 0) {
1424                                 warnx("Failed to set the %s option to %s",
1425                                       optstr[i], bool2str(new[i]));
1426                                 cleanup_exit(EXIT_FAILURE);
1427                         }
1428                         if (verbose_ftrace()) {
1429                                 mutex_lock(&print_mtx);
1430                                 printf("%s in %s was set to %s\n", optstr[i],
1431                                        TR_OPTIONS, bool2str(new[i]));
1432                                 mutex_unlock(&print_mtx);
1433                         }
1434                 }
1435 }
1436
1437 static void enable_tracer(void)
1438 {
1439         mutex_lock(&save_state.mutex);
1440         set_trace_opts(&save_state, use_options);
1441
1442         write_file(TR_THRESH, save_state.thresh, threshold, ERR_CLEANUP);
1443         write_file(TR_CURRENT, NOP_TRACER, current_tracer, ERR_CLEANUP);
1444
1445         mutex_unlock(&save_state.mutex);
1446 }
1447
1448 static void tracing_loop(void)
1449 {
1450         int ifd = inotify_init();
1451         int wd;
1452         const ssize_t bufsize = sizeof(inotify_buffer);
1453         const ssize_t istructsize = sizeof(struct inotify_event);
1454         char *buf = &inotify_buffer[0];
1455         ssize_t nr_read;
1456         char *p;
1457         int modified;
1458         struct inotify_event *event;
1459         struct entry req;
1460         char *buffer;
1461         const size_t bufspace = PRINT_BUFFER_SIZE;
1462         struct timespec timestamp;
1463
1464         print_priority();
1465
1466         buffer = malloc_or_die(bufspace);
1467
1468         if (ifd < 0)
1469                 err(EXIT_FAILURE, "inotify_init() failed!");
1470
1471
1472         if (setup_ftrace) {
1473                 /*
1474                  * We must disable the tracer before resetting the max_latency
1475                  */
1476                 save_and_disable_tracer();
1477                 /*
1478                  * We must reset the max_latency before the inotify_add_watch()
1479                  * call.
1480                  */
1481                 reset_max_latency();
1482         }
1483
1484         wd = inotify_add_watch(ifd, debug_maxlat, IN_MODIFY);
1485         if (wd < 0)
1486                 err(EXIT_FAILURE, "inotify_add_watch() failed!");
1487
1488         if (setup_ftrace)
1489                 enable_tracer();
1490
1491         signal_blocking(SIG_UNBLOCK);
1492
1493         while (true) {
1494                 modified = 0;
1495                 check_signals();
1496                 nr_read = read(ifd, buf, bufsize);
1497                 check_signals();
1498                 if (nr_read < 0) {
1499                         if (errno == EINTR)
1500                                 continue;
1501                         warn("read() failed on inotify fd!");
1502                         cleanup_exit(EXIT_FAILURE);
1503                 }
1504                 if (nr_read == bufsize)
1505                         warnx("inotify() buffer filled, skipping events");
1506                 if (nr_read < istructsize) {
1507                         warnx("read() returned too few bytes on inotify fd");
1508                         cleanup_exit(EXIT_FAILURE);
1509                 }
1510
1511                 for (p = buf; p < buf + nr_read;) {
1512                         event = (struct inotify_event *) p;
1513                         if ((event->mask & IN_MODIFY) != 0)
1514                                 modified++;
1515                         p += istructsize + event->len;
1516                 }
1517                 while (modified > 0) {
1518                         check_signals();
1519                         mutex_lock(&printstate.mutex);
1520                         check_signals();
1521                         printstate_next_ticket(&req);
1522                         if (printstate_cnt_read() > 0) {
1523                                 printstate_mark_req_completed(&req);
1524                                 mutex_unlock(&printstate.mutex);
1525                                 if (verbose_lostevent()) {
1526                                         clock_gettime_or_die(CLOCK_MONOTONIC,
1527                                                              &timestamp);
1528                                         print_lostmessage(&timestamp, buffer,
1529                                                           bufspace, &req,
1530                                                           "inotify loop");
1531                                 }
1532                                 break;
1533                         }
1534                         mutex_unlock(&printstate.mutex);
1535                         if (queue_try_to_add_entry(&printqueue, &req) != 0) {
1536                                 /* These prints could happen concurrently */
1537                                 check_signals();
1538                                 mutex_lock(&print_mtx);
1539                                 check_signals();
1540                                 write_or_die(fd_stdout, queue_full_warning,
1541                                              sizeof(queue_full_warning));
1542                                 mutex_unlock(&print_mtx);
1543                         }
1544                         modified--;
1545                 }
1546         }
1547 }
1548
1549 static void *do_printloop(void *arg)
1550 {
1551         const size_t bufspace = PRINT_BUFFER_SIZE;
1552         char *buffer;
1553         long *rseed = (long *) arg;
1554         struct drand48_data drandbuf;
1555         long slept = 0;
1556         struct entry req;
1557         int prob = 0;
1558         struct timespec timestamp;
1559         struct short_msg resize_msg;
1560
1561         print_priority();
1562
1563         if (srand48_r(*rseed, &drandbuf) != 0) {
1564                 warn("srand48_r() failed!\n");
1565                 cleanup_exit(EXIT_FAILURE);
1566         }
1567
1568         buffer = malloc_or_die(bufspace);
1569
1570         while (true) {
1571                 req = queue_wait_for_entry(&printqueue);
1572                 clock_gettime_or_die(CLOCK_MONOTONIC, &timestamp);
1573                 mutex_lock(&printstate.mutex);
1574                 if (prev_req_won_race(&req)) {
1575                         printstate_mark_req_completed(&req);
1576                         mutex_unlock(&printstate.mutex);
1577                         if (verbose_lostevent())
1578                                 print_lostmessage(&timestamp, buffer, bufspace,
1579                                                   &req, "print loop");
1580                         continue;
1581                 }
1582                 mutex_unlock(&printstate.mutex);
1583
1584                 /*
1585                  * Toss a coin to decide if we want to sleep before printing
1586                  * out the backtrace. The reason for this is that opening
1587                  * /sys/kernel/debug/tracing/trace will cause a blackout of
1588                  * hundreds of ms, where no latencies will be noted by the
1589                  * latency tracer. Thus by randomly sleeping we try to avoid
1590                  * missing traces systematically due to this. With this option
1591                  * we will sometimes get the first latency, some other times
1592                  * some of the later ones, in case of closely spaced traces.
1593                  */
1594                 if (trace_enable && use_random_sleep) {
1595                         slept = 0;
1596                         prob = table_get_probability(&req, &resize_msg);
1597                         if (!toss_coin(&drandbuf, prob))
1598                                 slept = go_to_sleep(&req);
1599                         if (slept >= 0) {
1600                                 /* A print is ongoing */
1601                                 printstate_cnt_inc();
1602                                 /*
1603                                  * We will do the printout below so we have to
1604                                  * mark it as completed while we still have the
1605                                  * mutex.
1606                                  */
1607                                 mutex_lock(&printstate.mutex);
1608                                 printstate_mark_req_completed(&req);
1609                                 mutex_unlock(&printstate.mutex);
1610                         }
1611                 }
1612                 if (trace_enable) {
1613                         /*
1614                          * slept < 0  means that we detected another
1615                          * notification in go_to_sleep() above
1616                          */
1617                         if (slept >= 0)
1618                                 /*
1619                                  * N.B. printstate_cnt_dec(); will be called
1620                                  * inside print_tracefile()
1621                                  */
1622                                 print_tracefile(&resize_msg, &timestamp, buffer,
1623                                                 bufspace, slept, &req);
1624                         else
1625                                 print_skipmessage(&resize_msg, &timestamp,
1626                                                   buffer, bufspace, &req, true);
1627                 } else {
1628                         print_skipmessage(&resize_msg, &timestamp, buffer,
1629                                           bufspace, &req, false);
1630                 }
1631         }
1632         return NULL;
1633 }
1634
1635 static void start_printthread(void)
1636 {
1637         unsigned int i;
1638         long *seed;
1639         int ufd;
1640
1641         ufd = open(DEV_URANDOM, O_RDONLY);
1642         if (nr_threads > MAX_THREADS) {
1643                 warnx(
1644 "Number of requested print threads was %d, max number is %d\n",
1645                       nr_threads, MAX_THREADS);
1646                 nr_threads = MAX_THREADS;
1647         }
1648         for (i = 0; i < nr_threads; i++) {
1649                 seed = malloc_or_die(sizeof(*seed));
1650                 if (ufd <  0 ||
1651                     read(ufd, seed, sizeof(*seed)) != sizeof(*seed)) {
1652                         printf(
1653 "Warning! Using trivial random number seed, since %s not available\n",
1654                         DEV_URANDOM);
1655                         fflush(stdout);
1656                         *seed = i;
1657                 }
1658                 errno = pthread_create(&printthread[i], NULL, do_printloop,
1659                                        seed);
1660                 if (errno)
1661                         err(EXIT_FAILURE, "pthread_create()");
1662         }
1663         if (ufd > 0 && close(ufd) != 0)
1664                 warn("close() failed");
1665 }
1666
1667 static void show_usage(void)
1668 {
1669         printf(
1670 "Usage: %s [OPTION]...\n\n"
1671 "Collect closely occurring latencies from %s\n"
1672 "with any of the following tracers: preemptirqsoff, preemptoff, irqsoff, "
1673 "wakeup,\nwakeup_dl, or wakeup_rt.\n\n"
1674
1675 "The occurrence of a latency is detected by monitoring the file\n"
1676 "%s with inotify.\n\n"
1677
1678 "The following options are supported:\n\n"
1679
1680 "-l, --list\t\tList the latency tracers that are supported by the\n"
1681 "\t\t\tcurrently running Linux kernel. If you don't see the\n"
1682 "\t\t\ttracer that you want, you will probably need to\n"
1683 "\t\t\tchange your kernel config and build a new kernel.\n\n"
1684
1685 "-t, --tracer TR\t\tUse the tracer TR. The default is to use the first\n"
1686 "\t\t\ttracer that is supported by the kernel in the following\n"
1687 "\t\t\torder of precedence:\n\n"
1688 "\t\t\tpreemptirqsoff\n"
1689 "\t\t\tpreemptoff\n"
1690 "\t\t\tirqsoff\n"
1691 "\t\t\twakeup\n"
1692 "\t\t\twakeup_rt\n"
1693 "\t\t\twakeup_dl\n"
1694 "\n"
1695 "\t\t\tIf TR is not on the list above, then a warning will be\n"
1696 "\t\t\tprinted.\n\n"
1697
1698 "-F, --force\t\tProceed even if another ftrace tracer is active. Without\n"
1699 "\t\t\tthis option, the program will refuse to start tracing if\n"
1700 "\t\t\tany other tracer than the nop tracer is active.\n\n"
1701
1702 "-s, --threshold TH\tConfigure ftrace to use a threshold of TH microseconds\n"
1703 "\t\t\tfor the tracer. The default is 0, which means that\n"
1704 "\t\t\ttracing_max_latency will be used. tracing_max_latency is\n"
1705 "\t\t\tset to 0 when the program is started and contains the\n"
1706 "\t\t\tmaximum of the latencies that have been encountered.\n\n"
1707
1708 "-f, --function\t\tEnable the function-trace option in trace_options. With\n"
1709 "\t\t\tthis option, ftrace will trace the functions that are\n"
1710 "\t\t\texecuted during a latency, without it we only get the\n"
1711 "\t\t\tbeginning, end, and backtrace.\n\n"
1712
1713 "-g, --graph\t\tEnable the display-graph option in trace_option. This\n"
1714 "\t\t\toption causes ftrace to show the graph of how functions\n"
1715 "\t\t\tare calling other functions.\n\n"
1716
1717 "-c, --policy POL\tRun the program with scheduling policy POL. POL can be\n"
1718 "\t\t\tother, batch, idle, rr or fifo. The default is rr. When\n"
1719 "\t\t\tusing rr or fifo, remember that these policies may cause\n"
1720 "\t\t\tother tasks to experience latencies.\n\n"
1721
1722 "-p, --priority PRI\tRun the program with priority PRI. The acceptable range\n"
1723 "\t\t\tof PRI depends on the scheduling policy.\n\n"
1724
1725 "-n, --notrace\t\tIf latency is detected, do not print out the content of\n"
1726 "\t\t\tthe trace file to standard output\n\n"
1727
1728 "-t, --threads NRTHR\tRun NRTHR threads for printing. Default is %d.\n\n"
1729
1730 "-r, --random\t\tArbitrarily sleep a certain amount of time, default\n"
1731 "\t\t\t%ld ms, before reading the trace file. The\n"
1732 "\t\t\tprobabilities for sleep are chosen so that the\n"
1733 "\t\t\tprobability of obtaining any of a cluster of closely\n"
1734 "\t\t\toccurring latencies are equal, i.e. we will randomly\n"
1735 "\t\t\tchoose which one we collect from the trace file.\n\n"
1736 "\t\t\tThis option is probably only useful with the irqsoff,\n"
1737 "\t\t\tpreemptoff, and preemptirqsoff tracers.\n\n"
1738
1739 "-a, --nrlat NRLAT\tFor the purpose of arbitrary delay, assume that there\n"
1740 "\t\t\tare no more than NRLAT clustered latencies. If NRLAT\n"
1741 "\t\t\tlatencies are detected during a run, this value will\n"
1742 "\t\t\tautomatically be increased to NRLAT + 1 and then to\n"
1743 "\t\t\tNRLAT + 2 and so on. The default is %d. This option\n"
1744 "\t\t\timplies -r. We need to know this number in order to\n"
1745 "\t\t\tbe able to calculate the probabilities of sleeping.\n"
1746 "\t\t\tSpecifically, the probabilities of not sleeping, i.e. to\n"
1747 "\t\t\tdo an immediate printout will be:\n\n"
1748 "\t\t\t1/NRLAT  1/(NRLAT - 1) ... 1/3  1/2  1\n\n"
1749 "\t\t\tThe probability of sleeping will be:\n\n"
1750 "\t\t\t1 - P, where P is from the series above\n\n"
1751 "\t\t\tThis descending probability will cause us to choose\n"
1752 "\t\t\tan occurrence at random. Observe that the final\n"
1753 "\t\t\tprobability is 0, it is when we reach this probability\n"
1754 "\t\t\tthat we increase NRLAT automatically. As an example,\n"
1755 "\t\t\twith the default value of 2, the probabilities will be:\n\n"
1756 "\t\t\t1/2  0\n\n"
1757 "\t\t\tThis means, when a latency is detected we will sleep\n"
1758 "\t\t\twith 50%% probability. If we ever detect another latency\n"
1759 "\t\t\tduring the sleep period, then the probability of sleep\n"
1760 "\t\t\twill be 0%% and the table will be expanded to:\n\n"
1761 "\t\t\t1/3  1/2  0\n\n"
1762
1763 "-v, --verbose\t\tIncrease the verbosity. If this option is given once,\n"
1764 "\t\t\tthen print a message every time that the NRLAT value\n"
1765 "\t\t\tis automatically increased. It also causes a message to\n"
1766 "\t\t\tbe printed when the ftrace settings are changed. If this\n"
1767 "\t\t\toption is given at least twice, then also print a\n"
1768 "\t\t\twarning for lost events.\n\n"
1769
1770 "-u, --time TIME\t\tArbitrarily sleep for a specified time TIME ms before\n"
1771 "\t\t\tprinting out the trace from the trace file. The default\n"
1772 "\t\t\tis %ld ms. This option implies -r.\n\n"
1773
1774 "-x, --no-ftrace\t\tDo not configure ftrace. This assume that the user\n"
1775 "\t\t\tconfigures the ftrace files in sysfs such as\n"
1776 "\t\t\t/sys/kernel/tracing/current_tracer or equivalent.\n\n"
1777
1778 "-i, --tracefile FILE\tUse FILE as trace file. The default is\n"
1779 "\t\t\t%s.\n"
1780 "\t\t\tThis options implies -x\n\n"
1781
1782 "-m, --max-lat FILE\tUse FILE as tracing_max_latency file. The default is\n"
1783 "\t\t\t%s.\n"
1784 "\t\t\tThis options implies -x\n\n"
1785 ,
1786 prg_name, debug_tracefile_dflt, debug_maxlat_dflt, DEFAULT_NR_PRINTER_THREADS,
1787 SLEEP_TIME_MS_DEFAULT, DEFAULT_TABLE_SIZE, SLEEP_TIME_MS_DEFAULT,
1788 debug_tracefile_dflt, debug_maxlat_dflt);
1789 }
1790
1791 static void find_tracefiles(void)
1792 {
1793         debug_tracefile_dflt = tracefs_get_tracing_file("trace");
1794         if (debug_tracefile_dflt == NULL) {
1795                 /* This is needed in show_usage() */
1796                 debug_tracefile_dflt = DEBUG_NOFILE;
1797         }
1798
1799         debug_maxlat_dflt = tracefs_get_tracing_file("tracing_max_latency");
1800         if (debug_maxlat_dflt == NULL) {
1801                 /* This is needed in show_usage() */
1802                 debug_maxlat_dflt = DEBUG_NOFILE;
1803         }
1804
1805         debug_tracefile = debug_tracefile_dflt;
1806         debug_maxlat = debug_maxlat_dflt;
1807 }
1808
1809 bool alldigits(const char *s)
1810 {
1811         for (; *s != '\0'; s++)
1812                 if (!isdigit(*s))
1813                         return false;
1814         return true;
1815 }
1816
1817 void check_alldigits(const char *optarg, const char *argname)
1818 {
1819         if (!alldigits(optarg))
1820                 errx(EXIT_FAILURE,
1821                      "The %s parameter expects a decimal argument\n", argname);
1822 }
1823
1824 static void scan_arguments(int argc, char *argv[])
1825 {
1826         int c;
1827         int i;
1828         int option_idx = 0;
1829
1830         static struct option long_options[] = {
1831                 { "list",       no_argument,            0, 'l' },
1832                 { "tracer",     required_argument,      0, 't' },
1833                 { "force",      no_argument,            0, 'F' },
1834                 { "threshold",  required_argument,      0, 's' },
1835                 { "function",   no_argument,            0, 'f' },
1836                 { "graph",      no_argument,            0, 'g' },
1837                 { "policy",     required_argument,      0, 'c' },
1838                 { "priority",   required_argument,      0, 'p' },
1839                 { "help",       no_argument,            0, 'h' },
1840                 { "notrace",    no_argument,            0, 'n' },
1841                 { "random",     no_argument,            0, 'r' },
1842                 { "nrlat",      required_argument,      0, 'a' },
1843                 { "threads",    required_argument,      0, 'e' },
1844                 { "time",       required_argument,      0, 'u' },
1845                 { "verbose",    no_argument,            0, 'v' },
1846                 { "no-ftrace",  no_argument,            0, 'x' },
1847                 { "tracefile",  required_argument,      0, 'i' },
1848                 { "max-lat",    required_argument,      0, 'm' },
1849                 { 0,            0,                      0,  0  }
1850         };
1851         const struct policy *p;
1852         int max, min;
1853         int value;
1854         bool notracer, valid;
1855
1856         /*
1857          * We must do this before parsing the arguments because show_usage()
1858          * needs to display these.
1859          */
1860         find_tracefiles();
1861
1862         while (true) {
1863                 c = getopt_long(argc, argv, "lt:Fs:fgc:p:hnra:e:u:vxi:m:",
1864                                 long_options, &option_idx);
1865                 if (c == -1)
1866                         break;
1867
1868                 switch (c) {
1869                 case 'l':
1870                         show_available();
1871                         exit(0);
1872                         break;
1873                 case 't':
1874                         current_tracer = strdup(optarg);
1875                         if (!is_relevant_tracer(current_tracer)) {
1876                                 warnx("%s is not a known latency tracer!\n",
1877                                       current_tracer);
1878                         }
1879                         valid = tracer_valid(current_tracer, &notracer);
1880                         if (notracer)
1881                                 errx(EXIT_FAILURE, no_tracer_msg);
1882                         if (!valid)
1883                                 errx(EXIT_FAILURE,
1884 "The tracer %s is not supported by your kernel!\n", current_tracer);
1885                         break;
1886                 case 'F':
1887                         force_tracer = true;
1888                         break;
1889                 case 's':
1890                         check_alldigits(optarg, "-s [--threshold]");
1891                         threshold = strdup(optarg);
1892                         break;
1893                 case 'f':
1894                         use_options[OPTIDX_FUNC_TR] = true;
1895                         break;
1896                 case 'g':
1897                         use_options[OPTIDX_DISP_GR] = true;
1898                         break;
1899                 case 'c':
1900                         p = policy_from_name(optarg);
1901                         if (p != NULL) {
1902                                 sched_policy = p->policy;
1903                                 sched_policy_set = true;
1904                                 if (!sched_pri_set) {
1905                                         sched_pri = p->default_pri;
1906                                         sched_pri_set = true;
1907                                 }
1908                         } else {
1909                                 warnx("Unknown scheduling %s\n", optarg);
1910                                 show_usage();
1911                                 exit(0);
1912                         }
1913                         break;
1914                 case 'p':
1915                         check_alldigits(optarg, "-p [--priority]");
1916                         sched_pri = atoi(optarg);
1917                         sched_pri_set = true;
1918                         break;
1919                 case 'h':
1920                         show_usage();
1921                         exit(0);
1922                         break;
1923                 case 'n':
1924                         trace_enable = false;
1925                         use_random_sleep = false;
1926                         break;
1927                 case 'e':
1928                         check_alldigits(optarg, "-e [--threads]");
1929                         value = atoi(optarg);
1930                         if (value > 0)
1931                                 nr_threads = value;
1932                         else {
1933                                 warnx("NRTHR must be > 0\n");
1934                                 show_usage();
1935                                 exit(0);
1936                         }
1937                         break;
1938                 case 'u':
1939                         check_alldigits(optarg, "-u [--time]");
1940                         value = atoi(optarg);
1941                         if (value < 0) {
1942                                 warnx("TIME must be >= 0\n");
1943                                 show_usage();
1944                                 ;
1945                         }
1946                         trace_enable = true;
1947                         use_random_sleep = true;
1948                         sleep_time = value * USEC_PER_MSEC;
1949                         break;
1950                 case 'v':
1951                         verbosity++;
1952                         break;
1953                 case 'r':
1954                         trace_enable = true;
1955                         use_random_sleep = true;
1956                         break;
1957                 case 'a':
1958                         check_alldigits(optarg, "-a [--nrlat]");
1959                         value = atoi(optarg);
1960                         if (value <= 0) {
1961                                 warnx("NRLAT must be > 0\n");
1962                                 show_usage();
1963                                 exit(0);
1964                         }
1965                         trace_enable = true;
1966                         use_random_sleep = true;
1967                         table_startsize = value;
1968                         break;
1969                 case 'x':
1970                         setup_ftrace = false;
1971                         break;
1972                 case 'i':
1973                         setup_ftrace = false;
1974                         debug_tracefile = strdup(optarg);
1975                         break;
1976                 case 'm':
1977                         setup_ftrace = false;
1978                         debug_maxlat = strdup(optarg);
1979                         break;
1980                 default:
1981                         show_usage();
1982                         exit(0);
1983                         break;
1984                 }
1985         }
1986
1987         if (setup_ftrace) {
1988                 if (!current_tracer) {
1989                         current_tracer = find_default_tracer();
1990                         if (!current_tracer)
1991                                 errx(EXIT_FAILURE,
1992 "No default tracer found and tracer not specified\n");
1993                 }
1994
1995                 if (use_random_sleep && !random_makes_sense(current_tracer)) {
1996                         warnx("WARNING: The tracer is %s and random sleep has",
1997                               current_tracer);
1998                         fprintf(stderr,
1999 "been enabled. Random sleep is intended for the following tracers:\n");
2000                         for (i = 0; random_tracers[i]; i++)
2001                                 fprintf(stderr, "%s\n", random_tracers[i]);
2002                         fprintf(stderr, "\n");
2003                 }
2004         }
2005
2006         if (debug_tracefile == DEBUG_NOFILE ||
2007             debug_maxlat == DEBUG_NOFILE)
2008                 errx(EXIT_FAILURE,
2009 "Could not find tracing directory e.g. /sys/kernel/tracing\n");
2010
2011         if (!sched_policy_set) {
2012                 sched_policy = SCHED_RR;
2013                 sched_policy_set = true;
2014                 if (!sched_pri_set) {
2015                         sched_pri = RT_DEFAULT_PRI;
2016                         sched_pri_set = true;
2017                 }
2018         }
2019
2020         max = sched_get_priority_max(sched_policy);
2021         min = sched_get_priority_min(sched_policy);
2022
2023         if (sched_pri < min) {
2024                 printf(
2025 "ATTENTION: Increasing priority to minimum, which is %d\n", min);
2026                 sched_pri = min;
2027         }
2028         if (sched_pri > max) {
2029                 printf(
2030 "ATTENTION: Reducing priority to maximum, which is %d\n", max);
2031                 sched_pri = max;
2032         }
2033 }
2034
2035 static void show_params(void)
2036 {
2037         printf(
2038 "\n"
2039 "Running with scheduling policy %s and priority %d. Using %d print threads.\n",
2040                 policy_name(sched_policy), sched_pri, nr_threads);
2041         if (trace_enable) {
2042                 if (use_random_sleep) {
2043                         printf(
2044 "%s will be printed with random delay\n"
2045 "Start size of the probability table:\t\t\t%d\n"
2046 "Print a message when the prob. table changes size:\t%s\n"
2047 "Print a warning when an event has been lost:\t\t%s\n"
2048 "Sleep time is:\t\t\t\t\t\t%ld ms\n",
2049 debug_tracefile,
2050 table_startsize,
2051 bool2str(verbose_sizechange()),
2052 bool2str(verbose_lostevent()),
2053 sleep_time / USEC_PER_MSEC);
2054                 } else {
2055                         printf("%s will be printed immediately\n",
2056                                debug_tracefile);
2057                 }
2058         } else {
2059                 printf("%s will not be printed\n",
2060                        debug_tracefile);
2061         }
2062         if (setup_ftrace) {
2063                 printf("Tracer:\t\t\t\t\t\t\t%s\n"
2064                        "%s option:\t\t\t\t\t%s\n"
2065                        "%s option:\t\t\t\t\t%s\n",
2066                        current_tracer,
2067                        optstr[OPTIDX_FUNC_TR],
2068                        bool2str(use_options[OPTIDX_FUNC_TR]),
2069                        optstr[OPTIDX_DISP_GR],
2070                        bool2str(use_options[OPTIDX_DISP_GR]));
2071                 if (!strcmp(threshold, "0"))
2072                         printf("Threshold:\t\t\t\t\t\ttracing_max_latency\n");
2073                 else
2074                         printf("Threshold:\t\t\t\t\t\t%s\n", threshold);
2075         }
2076         printf("\n");
2077 }
2078
2079 int main(int argc, char *argv[])
2080 {
2081         init_save_state();
2082         signal_blocking(SIG_BLOCK);
2083         setup_sig_handler();
2084         open_stdout();
2085
2086         if (argc >= 1)
2087                 prg_name = argv[0];
2088         else
2089                 prg_name = prg_unknown;
2090
2091         scan_arguments(argc, argv);
2092         show_params();
2093
2094         init_printstate();
2095         init_print_mtx();
2096         if (use_random_sleep) {
2097                 init_probabilities();
2098                 if (verbose_sizechange())
2099                         printf("Initializing probability table to %d\n",
2100                                table_startsize);
2101                 sleeptable_resize(table_startsize, false, NULL);
2102         }
2103         set_priority();
2104         init_queue(&printqueue);
2105         start_printthread();
2106         tracing_loop();
2107         return 0;
2108 }