perf thread-stack: Add thread_stack__sample_late()
tools/perf/util/thread-stack.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * thread-stack.c: Synthesize a thread's stack using call / return events
4  * Copyright (c) 2014, Intel Corporation.
5  */
6
7 #include <linux/rbtree.h>
8 #include <linux/list.h>
9 #include <linux/log2.h>
10 #include <linux/zalloc.h>
11 #include <errno.h>
12 #include <stdlib.h>
13 #include <string.h>
14 #include "thread.h"
15 #include "event.h"
16 #include "machine.h"
17 #include "env.h"
18 #include "debug.h"
19 #include "symbol.h"
20 #include "comm.h"
21 #include "call-path.h"
22 #include "thread-stack.h"
23
24 #define STACK_GROWTH 2048
25
26 /*
27  * State of retpoline detection.
28  *
29  * RETPOLINE_NONE: no retpoline detection
30  * X86_RETPOLINE_POSSIBLE: x86 retpoline possible
31  * X86_RETPOLINE_DETECTED: x86 retpoline detected
32  */
33 enum retpoline_state_t {
34         RETPOLINE_NONE,
35         X86_RETPOLINE_POSSIBLE,
36         X86_RETPOLINE_DETECTED,
37 };
38
39 /**
40  * struct thread_stack_entry - thread stack entry.
41  * @ret_addr: return address
42  * @timestamp: timestamp (if known)
43  * @ref: external reference (e.g. db_id of sample)
44  * @branch_count: the branch count when the entry was created
45  * @insn_count: the instruction count when the entry was created
46  * @cyc_count: the cycle count when the entry was created
47  * @db_id: id used for db-export
48  * @cp: call path
49  * @no_call: a 'call' was not seen
50  * @trace_end: a 'call' but trace ended
51  * @non_call: a branch but not a 'call' to the start of a different symbol
52  */
53 struct thread_stack_entry {
54         u64 ret_addr;
55         u64 timestamp;
56         u64 ref;
57         u64 branch_count;
58         u64 insn_count;
59         u64 cyc_count;
60         u64 db_id;
61         struct call_path *cp;
62         bool no_call;
63         bool trace_end;
64         bool non_call;
65 };
66
67 /**
68  * struct thread_stack - thread stack constructed from 'call' and 'return'
69  *                       branch samples.
70  * @stack: array that holds the stack
71  * @cnt: number of entries in the stack
72  * @sz: current maximum stack size
73  * @trace_nr: current trace number
74  * @branch_count: running branch count
75  * @insn_count: running instruction count
76  * @cyc_count: running cycle count
77  * @kernel_start: kernel start address
78  * @last_time: last timestamp
79  * @crp: call/return processor
80  * @comm: current comm
81  * @arr_sz: size of array if this is the first element of an array
82  * @rstate: used to detect retpolines
83  */
84 struct thread_stack {
85         struct thread_stack_entry *stack;
86         size_t cnt;
87         size_t sz;
88         u64 trace_nr;
89         u64 branch_count;
90         u64 insn_count;
91         u64 cyc_count;
92         u64 kernel_start;
93         u64 last_time;
94         struct call_return_processor *crp;
95         struct comm *comm;
96         unsigned int arr_sz;
97         enum retpoline_state_t rstate;
98 };
99
100 /*
101  * Assume pid == tid == 0 identifies the idle task as defined by
102  * perf_session__register_idle_thread(). The idle task is really 1 task per cpu,
103  * and therefore requires a stack for each cpu.
104  */
105 static inline bool thread_stack__per_cpu(struct thread *thread)
106 {
107         return !(thread->tid || thread->pid_);
108 }
109
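/* Grow the stack array by STACK_GROWTH entries, preserving existing entries */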
110 static int thread_stack__grow(struct thread_stack *ts)
111 {
112         struct thread_stack_entry *new_stack;
113         size_t sz, new_sz;
114
115         new_sz = ts->sz + STACK_GROWTH;
116         sz = new_sz * sizeof(struct thread_stack_entry);
117
118         new_stack = realloc(ts->stack, sz);
119         if (!new_stack)
120                 return -ENOMEM;
121
122         ts->stack = new_stack;
123         ts->sz = new_sz;
124
125         return 0;
126 }
127
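/*
 * Allocate the initial stack entries and record the kernel start address,
 * whether x86 retpoline detection applies, and the call/return processor.
 */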
128 static int thread_stack__init(struct thread_stack *ts, struct thread *thread,
129                               struct call_return_processor *crp)
130 {
131         int err;
132
133         err = thread_stack__grow(ts);
134         if (err)
135                 return err;
136
137         if (thread->maps && thread->maps->machine) {
138                 struct machine *machine = thread->maps->machine;
139                 const char *arch = perf_env__arch(machine->env);
140
141                 ts->kernel_start = machine__kernel_start(machine);
142                 if (!strcmp(arch, "x86"))
143                         ts->rstate = X86_RETPOLINE_POSSIBLE;
144         } else {
145                 ts->kernel_start = 1ULL << 63;
146         }
147         ts->crp = crp;
148
149         return 0;
150 }
151
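/*
 * Get or create the stack for @cpu, growing the per-cpu array if necessary
 * (the idle task needs a stack for each cpu) and initializing the stack on
 * first use.
 */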
152 static struct thread_stack *thread_stack__new(struct thread *thread, int cpu,
153                                               struct call_return_processor *crp)
154 {
155         struct thread_stack *ts = thread->ts, *new_ts;
156         unsigned int old_sz = ts ? ts->arr_sz : 0;
157         unsigned int new_sz = 1;
158
159         if (thread_stack__per_cpu(thread) && cpu > 0)
160                 new_sz = roundup_pow_of_two(cpu + 1);
161
162         if (!ts || new_sz > old_sz) {
163                 new_ts = calloc(new_sz, sizeof(*ts));
164                 if (!new_ts)
165                         return NULL;
166                 if (ts)
167                         memcpy(new_ts, ts, old_sz * sizeof(*ts));
168                 new_ts->arr_sz = new_sz;
169                 zfree(&thread->ts);
170                 thread->ts = new_ts;
171                 ts = new_ts;
172         }
173
174         if (thread_stack__per_cpu(thread) && cpu > 0 &&
175             (unsigned int)cpu < ts->arr_sz)
176                 ts += cpu;
177
178         if (!ts->stack &&
179             thread_stack__init(ts, thread, crp))
180                 return NULL;
181
182         return ts;
183 }
184
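/* Get the already-initialized per-cpu stack for @cpu, or NULL if there is none */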
185 static struct thread_stack *thread__cpu_stack(struct thread *thread, int cpu)
186 {
187         struct thread_stack *ts = thread->ts;
188
189         if (cpu < 0)
190                 cpu = 0;
191
192         if (!ts || (unsigned int)cpu >= ts->arr_sz)
193                 return NULL;
194
195         ts += cpu;
196
197         if (!ts->stack)
198                 return NULL;
199
200         return ts;
201 }
202
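/*
 * Get the thread's stack, selecting the per-cpu stack in the case of the
 * idle task.  Returns NULL if there is no stack.
 */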
203 static inline struct thread_stack *thread__stack(struct thread *thread,
204                                                     int cpu)
205 {
206         if (!thread)
207                 return NULL;
208
209         if (thread_stack__per_cpu(thread))
210                 return thread__cpu_stack(thread, cpu);
211
212         return thread->ts;
213 }
214
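/* Push a return address (simple stack used without a call/return processor) */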
215 static int thread_stack__push(struct thread_stack *ts, u64 ret_addr,
216                               bool trace_end)
217 {
218         int err = 0;
219
220         if (ts->cnt == ts->sz) {
221                 err = thread_stack__grow(ts);
222                 if (err) {
223                         pr_warning("Out of memory: discarding thread stack\n");
224                         ts->cnt = 0;
225                 }
226         }
227
228         ts->stack[ts->cnt].trace_end = trace_end;
229         ts->stack[ts->cnt++].ret_addr = ret_addr;
230
231         return err;
232 }
233
234 static void thread_stack__pop(struct thread_stack *ts, u64 ret_addr)
235 {
236         size_t i;
237
238         /*
239          * In some cases there may be functions which are not seen to return,
240          * for example when setjmp / longjmp has been used, or when the perf
241          * context switch in the kernel does not stop and start tracing in
242          * exactly the same code path.  When that happens the return address
243          * will be further down the stack.  If the return address is not found
244          * at all, assume the opposite (i.e. this is a return for a call that
245          * was not seen for some reason) and leave the stack alone.
246          */
247         for (i = ts->cnt; i; ) {
248                 if (ts->stack[--i].ret_addr == ret_addr) {
249                         ts->cnt = i;
250                         return;
251                 }
252         }
253 }
254
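/* Pop consecutive 'trace end' entries from the top of the stack */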
255 static void thread_stack__pop_trace_end(struct thread_stack *ts)
256 {
257         size_t i;
258
259         for (i = ts->cnt; i; ) {
260                 if (ts->stack[--i].trace_end)
261                         ts->cnt = i;
262                 else
263                         return;
264         }
265 }
266
267 static bool thread_stack__in_kernel(struct thread_stack *ts)
268 {
269         if (!ts->cnt)
270                 return false;
271
272         return ts->stack[ts->cnt - 1].cp->in_kernel;
273 }
274
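/*
 * Report the call/return information for stack entry @idx via the
 * call/return processor's process() callback.
 */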
275 static int thread_stack__call_return(struct thread *thread,
276                                      struct thread_stack *ts, size_t idx,
277                                      u64 timestamp, u64 ref, bool no_return)
278 {
279         struct call_return_processor *crp = ts->crp;
280         struct thread_stack_entry *tse;
281         struct call_return cr = {
282                 .thread = thread,
283                 .comm = ts->comm,
284                 .db_id = 0,
285         };
286         u64 *parent_db_id;
287
288         tse = &ts->stack[idx];
289         cr.cp = tse->cp;
290         cr.call_time = tse->timestamp;
291         cr.return_time = timestamp;
292         cr.branch_count = ts->branch_count - tse->branch_count;
293         cr.insn_count = ts->insn_count - tse->insn_count;
294         cr.cyc_count = ts->cyc_count - tse->cyc_count;
295         cr.db_id = tse->db_id;
296         cr.call_ref = tse->ref;
297         cr.return_ref = ref;
298         if (tse->no_call)
299                 cr.flags |= CALL_RETURN_NO_CALL;
300         if (no_return)
301                 cr.flags |= CALL_RETURN_NO_RETURN;
302         if (tse->non_call)
303                 cr.flags |= CALL_RETURN_NON_CALL;
304
305         /*
306          * The parent db_id must be assigned before exporting the child. Note
307          * it is not possible to export the parent first because its information
308          * is not complete until its 'return' has been processed.
309          */
310         parent_db_id = idx ? &(tse - 1)->db_id : NULL;
311
312         return crp->process(&cr, parent_db_id, crp->data);
313 }
314
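/*
 * Empty the stack.  If there is a call/return processor, report each
 * remaining entry as a call that was not seen to return.
 */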
315 static int __thread_stack__flush(struct thread *thread, struct thread_stack *ts)
316 {
317         struct call_return_processor *crp = ts->crp;
318         int err;
319
320         if (!crp) {
321                 ts->cnt = 0;
322                 return 0;
323         }
324
325         while (ts->cnt) {
326                 err = thread_stack__call_return(thread, ts, --ts->cnt,
327                                                 ts->last_time, 0, true);
328                 if (err) {
329                         pr_err("Error flushing thread stack!\n");
330                         ts->cnt = 0;
331                         return err;
332                 }
333         }
334
335         return 0;
336 }
337
338 int thread_stack__flush(struct thread *thread)
339 {
340         struct thread_stack *ts = thread->ts;
341         unsigned int pos;
342         int err = 0;
343
344         if (ts) {
345                 for (pos = 0; pos < ts->arr_sz; pos++) {
346                         int ret = __thread_stack__flush(thread, ts + pos);
347
348                         if (ret)
349                                 err = ret;
350                 }
351         }
352
353         return err;
354 }
355
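/*
 * Update the stack for a branch event: push the return address for a 'call',
 * pop for a 'return', and flush if the trace number changes.  The push / pop
 * handling is skipped when thread_stack__process() is in use.
 */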
356 int thread_stack__event(struct thread *thread, int cpu, u32 flags, u64 from_ip,
357                         u64 to_ip, u16 insn_len, u64 trace_nr)
358 {
359         struct thread_stack *ts = thread__stack(thread, cpu);
360
361         if (!thread)
362                 return -EINVAL;
363
364         if (!ts) {
365                 ts = thread_stack__new(thread, cpu, NULL);
366                 if (!ts) {
367                         pr_warning("Out of memory: no thread stack\n");
368                         return -ENOMEM;
369                 }
370                 ts->trace_nr = trace_nr;
371         }
372
373         /*
374          * When the trace is discontinuous, the trace_nr changes.  In that case
375          * the stack might be completely invalid.  Better to report nothing than
376          * to report something misleading, so flush the stack.
377          */
378         if (trace_nr != ts->trace_nr) {
379                 if (ts->trace_nr)
380                         __thread_stack__flush(thread, ts);
381                 ts->trace_nr = trace_nr;
382         }
383
384         /* Stop here if thread_stack__process() is in use */
385         if (ts->crp)
386                 return 0;
387
388         if (flags & PERF_IP_FLAG_CALL) {
389                 u64 ret_addr;
390
391                 if (!to_ip)
392                         return 0;
393                 ret_addr = from_ip + insn_len;
394                 if (ret_addr == to_ip)
395                         return 0; /* Zero-length calls are excluded */
396                 return thread_stack__push(ts, ret_addr,
397                                           flags & PERF_IP_FLAG_TRACE_END);
398         } else if (flags & PERF_IP_FLAG_TRACE_BEGIN) {
399                 /*
400                  * If the caller did not change the trace number (which would
401                  * have flushed the stack) then try to make sense of the stack.
402                  * Possibly, tracing began after returning to the current
403                  * address, so try to pop that.  Also, a call made when the
404                  * trace ended is not expected to return, so pop that too.
405                  */
406                 thread_stack__pop(ts, to_ip);
407                 thread_stack__pop_trace_end(ts);
408         } else if ((flags & PERF_IP_FLAG_RETURN) && from_ip) {
409                 thread_stack__pop(ts, to_ip);
410         }
411
412         return 0;
413 }
414
415 void thread_stack__set_trace_nr(struct thread *thread, int cpu, u64 trace_nr)
416 {
417         struct thread_stack *ts = thread__stack(thread, cpu);
418
419         if (!ts)
420                 return;
421
422         if (trace_nr != ts->trace_nr) {
423                 if (ts->trace_nr)
424                         __thread_stack__flush(thread, ts);
425                 ts->trace_nr = trace_nr;
426         }
427 }
428
429 static void __thread_stack__free(struct thread *thread, struct thread_stack *ts)
430 {
431         __thread_stack__flush(thread, ts);
432         zfree(&ts->stack);
433 }
434
435 static void thread_stack__reset(struct thread *thread, struct thread_stack *ts)
436 {
437         unsigned int arr_sz = ts->arr_sz;
438
439         __thread_stack__free(thread, ts);
440         memset(ts, 0, sizeof(*ts));
441         ts->arr_sz = arr_sz;
442 }
443
444 void thread_stack__free(struct thread *thread)
445 {
446         struct thread_stack *ts = thread->ts;
447         unsigned int pos;
448
449         if (ts) {
450                 for (pos = 0; pos < ts->arr_sz; pos++)
451                         __thread_stack__free(thread, ts + pos);
452                 zfree(&thread->ts);
453         }
454 }
455
456 static inline u64 callchain_context(u64 ip, u64 kernel_start)
457 {
458         return ip < kernel_start ? PERF_CONTEXT_USER : PERF_CONTEXT_KERNEL;
459 }
460
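/*
 * Fill @chain with a synthesized call chain: the sample ip followed by the
 * return addresses on the stack, inserting a context marker whenever the
 * addresses cross between user space and kernel space.
 */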
461 void thread_stack__sample(struct thread *thread, int cpu,
462                           struct ip_callchain *chain,
463                           size_t sz, u64 ip, u64 kernel_start)
464 {
465         struct thread_stack *ts = thread__stack(thread, cpu);
466         u64 context = callchain_context(ip, kernel_start);
467         u64 last_context;
468         size_t i, j;
469
470         if (sz < 2) {
471                 chain->nr = 0;
472                 return;
473         }
474
475         chain->ips[0] = context;
476         chain->ips[1] = ip;
477
478         if (!ts) {
479                 chain->nr = 2;
480                 return;
481         }
482
483         last_context = context;
484
485         for (i = 2, j = 1; i < sz && j <= ts->cnt; i++, j++) {
486                 ip = ts->stack[ts->cnt - j].ret_addr;
487                 context = callchain_context(ip, kernel_start);
488                 if (context != last_context) {
489                         if (i >= sz - 1)
490                                 break;
491                         chain->ips[i++] = context;
492                         last_context = context;
493                 }
494                 chain->ips[i] = ip;
495         }
496
497         chain->nr = i;
498 }
499
500 /*
501  * Hardware sample records, created some time after the event occurred, need to
502  * have subsequent addresses removed from the call chain.
503  */
504 void thread_stack__sample_late(struct thread *thread, int cpu,
505                                struct ip_callchain *chain, size_t sz,
506                                u64 sample_ip, u64 kernel_start)
507 {
508         struct thread_stack *ts = thread__stack(thread, cpu);
509         u64 sample_context = callchain_context(sample_ip, kernel_start);
510         u64 last_context, context, ip;
511         size_t nr = 0, j;
512
513         if (sz < 2) {
514                 chain->nr = 0;
515                 return;
516         }
517
518         if (!ts)
519                 goto out;
520
521         /*
522          * When tracing kernel space, kernel addresses occur at the top of the
523          * call chain after the event occurred but before tracing stopped.
524          * Skip them.
525          */
526         for (j = 1; j <= ts->cnt; j++) {
527                 ip = ts->stack[ts->cnt - j].ret_addr;
528                 context = callchain_context(ip, kernel_start);
529                 if (context == PERF_CONTEXT_USER ||
530                     (context == sample_context && ip == sample_ip))
531                         break;
532         }
533
534         last_context = sample_ip; /* Use sample_ip as an invalid context */
535
536         for (; nr < sz && j <= ts->cnt; nr++, j++) {
537                 ip = ts->stack[ts->cnt - j].ret_addr;
538                 context = callchain_context(ip, kernel_start);
539                 if (context != last_context) {
540                         if (nr >= sz - 1)
541                                 break;
542                         chain->ips[nr++] = context;
543                         last_context = context;
544                 }
545                 chain->ips[nr] = ip;
546         }
547 out:
548         if (nr) {
549                 chain->nr = nr;
550         } else {
551                 chain->ips[0] = sample_context;
552                 chain->ips[1] = sample_ip;
553                 chain->nr = 2;
554         }
555 }
556
557 struct call_return_processor *
558 call_return_processor__new(int (*process)(struct call_return *cr, u64 *parent_db_id, void *data),
559                            void *data)
560 {
561         struct call_return_processor *crp;
562
563         crp = zalloc(sizeof(struct call_return_processor));
564         if (!crp)
565                 return NULL;
566         crp->cpr = call_path_root__new();
567         if (!crp->cpr)
568                 goto out_free;
569         crp->process = process;
570         crp->data = data;
571         return crp;
572
573 out_free:
574         free(crp);
575         return NULL;
576 }
577
578 void call_return_processor__free(struct call_return_processor *crp)
579 {
580         if (crp) {
581                 call_path_root__free(crp->cpr);
582                 free(crp);
583         }
584 }
585
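/* Push an entry together with its call path (call/return processor only) */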
586 static int thread_stack__push_cp(struct thread_stack *ts, u64 ret_addr,
587                                  u64 timestamp, u64 ref, struct call_path *cp,
588                                  bool no_call, bool trace_end)
589 {
590         struct thread_stack_entry *tse;
591         int err;
592
593         if (!cp)
594                 return -ENOMEM;
595
596         if (ts->cnt == ts->sz) {
597                 err = thread_stack__grow(ts);
598                 if (err)
599                         return err;
600         }
601
602         tse = &ts->stack[ts->cnt++];
603         tse->ret_addr = ret_addr;
604         tse->timestamp = timestamp;
605         tse->ref = ref;
606         tse->branch_count = ts->branch_count;
607         tse->insn_count = ts->insn_count;
608         tse->cyc_count = ts->cyc_count;
609         tse->cp = cp;
610         tse->no_call = no_call;
611         tse->trace_end = trace_end;
612         tse->non_call = false;
613         tse->db_id = 0;
614
615         return 0;
616 }
617
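/*
 * Pop entries until one matching @ret_addr (or, for the last entry, @sym) is
 * found, reporting each popped entry via the call/return processor.  Returns
 * 1 if no matching entry is found.
 */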
618 static int thread_stack__pop_cp(struct thread *thread, struct thread_stack *ts,
619                                 u64 ret_addr, u64 timestamp, u64 ref,
620                                 struct symbol *sym)
621 {
622         int err;
623
624         if (!ts->cnt)
625                 return 1;
626
627         if (ts->cnt == 1) {
628                 struct thread_stack_entry *tse = &ts->stack[0];
629
630                 if (tse->cp->sym == sym)
631                         return thread_stack__call_return(thread, ts, --ts->cnt,
632                                                          timestamp, ref, false);
633         }
634
635         if (ts->stack[ts->cnt - 1].ret_addr == ret_addr &&
636             !ts->stack[ts->cnt - 1].non_call) {
637                 return thread_stack__call_return(thread, ts, --ts->cnt,
638                                                  timestamp, ref, false);
639         } else {
640                 size_t i = ts->cnt - 1;
641
642                 while (i--) {
643                         if (ts->stack[i].ret_addr != ret_addr ||
644                             ts->stack[i].non_call)
645                                 continue;
646                         i += 1;
647                         while (ts->cnt > i) {
648                                 err = thread_stack__call_return(thread, ts,
649                                                                 --ts->cnt,
650                                                                 timestamp, ref,
651                                                                 true);
652                                 if (err)
653                                         return err;
654                         }
655                         return thread_stack__call_return(thread, ts, --ts->cnt,
656                                                          timestamp, ref, false);
657                 }
658         }
659
660         return 1;
661 }
662
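/* Seed an empty stack with the current symbol, marked as having no 'call' */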
663 static int thread_stack__bottom(struct thread_stack *ts,
664                                 struct perf_sample *sample,
665                                 struct addr_location *from_al,
666                                 struct addr_location *to_al, u64 ref)
667 {
668         struct call_path_root *cpr = ts->crp->cpr;
669         struct call_path *cp;
670         struct symbol *sym;
671         u64 ip;
672
673         if (sample->ip) {
674                 ip = sample->ip;
675                 sym = from_al->sym;
676         } else if (sample->addr) {
677                 ip = sample->addr;
678                 sym = to_al->sym;
679         } else {
680                 return 0;
681         }
682
683         cp = call_path__findnew(cpr, &cpr->call_path, sym, ip,
684                                 ts->kernel_start);
685
686         return thread_stack__push_cp(ts, ip, sample->time, ref, cp,
687                                      true, false);
688 }
689
690 static int thread_stack__pop_ks(struct thread *thread, struct thread_stack *ts,
691                                 struct perf_sample *sample, u64 ref)
692 {
693         u64 tm = sample->time;
694         int err;
695
696         /* Return to userspace, so pop all kernel addresses */
697         while (thread_stack__in_kernel(ts)) {
698                 err = thread_stack__call_return(thread, ts, --ts->cnt,
699                                                 tm, ref, true);
700                 if (err)
701                         return err;
702         }
703
704         return 0;
705 }
706
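/*
 * Handle a 'return' for which no matching 'call' was seen, e.g. because the
 * call happened before the trace started or the 'return' is really being
 * used as a jump.  Adjust the stack so that the call graph stays consistent.
 */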
707 static int thread_stack__no_call_return(struct thread *thread,
708                                         struct thread_stack *ts,
709                                         struct perf_sample *sample,
710                                         struct addr_location *from_al,
711                                         struct addr_location *to_al, u64 ref)
712 {
713         struct call_path_root *cpr = ts->crp->cpr;
714         struct call_path *root = &cpr->call_path;
715         struct symbol *fsym = from_al->sym;
716         struct symbol *tsym = to_al->sym;
717         struct call_path *cp, *parent;
718         u64 ks = ts->kernel_start;
719         u64 addr = sample->addr;
720         u64 tm = sample->time;
721         u64 ip = sample->ip;
722         int err;
723
724         if (ip >= ks && addr < ks) {
725                 /* Return to userspace, so pop all kernel addresses */
726                 err = thread_stack__pop_ks(thread, ts, sample, ref);
727                 if (err)
728                         return err;
729
730                 /* If the stack is empty, push the userspace address */
731                 if (!ts->cnt) {
732                         cp = call_path__findnew(cpr, root, tsym, addr, ks);
733                         return thread_stack__push_cp(ts, 0, tm, ref, cp, true,
734                                                      false);
735                 }
736         } else if (thread_stack__in_kernel(ts) && ip < ks) {
737                 /* Return to userspace, so pop all kernel addresses */
738                 err = thread_stack__pop_ks(thread, ts, sample, ref);
739                 if (err)
740                         return err;
741         }
742
743         if (ts->cnt)
744                 parent = ts->stack[ts->cnt - 1].cp;
745         else
746                 parent = root;
747
748         if (parent->sym == from_al->sym) {
749                 /*
750                  * At the bottom of the stack, assume the missing 'call' was
751                  * before the trace started. So, pop the current symbol and push
752                  * the 'to' symbol.
753                  */
754                 if (ts->cnt == 1) {
755                         err = thread_stack__call_return(thread, ts, --ts->cnt,
756                                                         tm, ref, false);
757                         if (err)
758                                 return err;
759                 }
760
761                 if (!ts->cnt) {
762                         cp = call_path__findnew(cpr, root, tsym, addr, ks);
763
764                         return thread_stack__push_cp(ts, addr, tm, ref, cp,
765                                                      true, false);
766                 }
767
768                 /*
769                  * Otherwise assume the 'return' is being used as a jump (e.g.
770                  * retpoline) and just push the 'to' symbol.
771                  */
772                 cp = call_path__findnew(cpr, parent, tsym, addr, ks);
773
774                 err = thread_stack__push_cp(ts, 0, tm, ref, cp, true, false);
775                 if (!err)
776                         ts->stack[ts->cnt - 1].non_call = true;
777
778                 return err;
779         }
780
781         /*
782          * Assume 'parent' has not yet returned, so push 'to', and then push and
783          * pop 'from'.
784          */
785
786         cp = call_path__findnew(cpr, parent, tsym, addr, ks);
787
788         err = thread_stack__push_cp(ts, addr, tm, ref, cp, true, false);
789         if (err)
790                 return err;
791
792         cp = call_path__findnew(cpr, cp, fsym, ip, ks);
793
794         err = thread_stack__push_cp(ts, ip, tm, ref, cp, true, false);
795         if (err)
796                 return err;
797
798         return thread_stack__call_return(thread, ts, --ts->cnt, tm, ref, false);
799 }
800
801 static int thread_stack__trace_begin(struct thread *thread,
802                                      struct thread_stack *ts, u64 timestamp,
803                                      u64 ref)
804 {
805         struct thread_stack_entry *tse;
806         int err;
807
808         if (!ts->cnt)
809                 return 0;
810
811         /* Pop trace end */
812         tse = &ts->stack[ts->cnt - 1];
813         if (tse->trace_end) {
814                 err = thread_stack__call_return(thread, ts, --ts->cnt,
815                                                 timestamp, ref, false);
816                 if (err)
817                         return err;
818         }
819
820         return 0;
821 }
822
823 static int thread_stack__trace_end(struct thread_stack *ts,
824                                    struct perf_sample *sample, u64 ref)
825 {
826         struct call_path_root *cpr = ts->crp->cpr;
827         struct call_path *cp;
828         u64 ret_addr;
829
830         /* No point having 'trace end' on the bottom of the stack */
831         if (!ts->cnt || (ts->cnt == 1 && ts->stack[0].ref == ref))
832                 return 0;
833
834         cp = call_path__findnew(cpr, ts->stack[ts->cnt - 1].cp, NULL, 0,
835                                 ts->kernel_start);
836
837         ret_addr = sample->ip + sample->insn_len;
838
839         return thread_stack__push_cp(ts, ret_addr, sample->time, ref, cp,
840                                      false, true);
841 }
842
843 static bool is_x86_retpoline(const char *name)
844 {
845         const char *p = strstr(name, "__x86_indirect_thunk_");
846
847         return p == name || !strcmp(name, "__indirect_thunk_start");
848 }
849
850 /*
851  * x86 retpoline functions pollute the call graph. This function removes them.
852  * This does not handle function return thunks, nor is there any improvement
853  * for the handling of inline thunks or extern thunks.
854  */
855 static int thread_stack__x86_retpoline(struct thread_stack *ts,
856                                        struct perf_sample *sample,
857                                        struct addr_location *to_al)
858 {
859         struct thread_stack_entry *tse = &ts->stack[ts->cnt - 1];
860         struct call_path_root *cpr = ts->crp->cpr;
861         struct symbol *sym = tse->cp->sym;
862         struct symbol *tsym = to_al->sym;
863         struct call_path *cp;
864
865         if (sym && is_x86_retpoline(sym->name)) {
866                 /*
867                  * This is an x86 retpoline fn. It pollutes the call graph by
868                  * showing up everywhere there is an indirect branch, but does
869                  * not itself mean anything. Here the top-of-stack is removed,
870                  * by decrementing the stack count, and then further down, the
871                  * resulting top-of-stack is replaced with the actual target.
872                  * The result is that the retpoline functions will no longer
873                  * appear in the call graph. Note this only affects the call
874                  * graph, since all the original branches are left unchanged.
875                  */
876                 ts->cnt -= 1;
877                 sym = ts->stack[ts->cnt - 2].cp->sym;
878                 if (sym && sym == tsym && to_al->addr != tsym->start) {
879                         /*
880                          * Target is back to the middle of the symbol we came
881                          * from so assume it is an indirect jmp and forget it
882                          * altogether.
883                          */
884                         ts->cnt -= 1;
885                         return 0;
886                 }
887         } else if (sym && sym == tsym) {
888                 /*
889                  * Target is back to the symbol we came from so assume it is an
890                  * indirect jmp and forget it altogether.
891                  */
892                 ts->cnt -= 1;
893                 return 0;
894         }
895
896         cp = call_path__findnew(cpr, ts->stack[ts->cnt - 2].cp, tsym,
897                                 sample->addr, ts->kernel_start);
898         if (!cp)
899                 return -ENOMEM;
900
901         /* Replace the top-of-stack with the actual target */
902         ts->stack[ts->cnt - 1].cp = cp;
903
904         return 0;
905 }
906
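/*
 * Update the call stack from a branch sample and report completed calls via
 * the call/return processor.  Handles calls, returns, trace begin / end,
 * jumps to the start of a different symbol, and x86 retpolines.
 */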
907 int thread_stack__process(struct thread *thread, struct comm *comm,
908                           struct perf_sample *sample,
909                           struct addr_location *from_al,
910                           struct addr_location *to_al, u64 ref,
911                           struct call_return_processor *crp)
912 {
913         struct thread_stack *ts = thread__stack(thread, sample->cpu);
914         enum retpoline_state_t rstate;
915         int err = 0;
916
917         if (ts && !ts->crp) {
918                 /* Supersede thread_stack__event() */
919                 thread_stack__reset(thread, ts);
920                 ts = NULL;
921         }
922
923         if (!ts) {
924                 ts = thread_stack__new(thread, sample->cpu, crp);
925                 if (!ts)
926                         return -ENOMEM;
927                 ts->comm = comm;
928         }
929
930         rstate = ts->rstate;
931         if (rstate == X86_RETPOLINE_DETECTED)
932                 ts->rstate = X86_RETPOLINE_POSSIBLE;
933
934         /* Flush stack on exec */
935         if (ts->comm != comm && thread->pid_ == thread->tid) {
936                 err = __thread_stack__flush(thread, ts);
937                 if (err)
938                         return err;
939                 ts->comm = comm;
940         }
941
942         /* If the stack is empty, put the current symbol on the stack */
943         if (!ts->cnt) {
944                 err = thread_stack__bottom(ts, sample, from_al, to_al, ref);
945                 if (err)
946                         return err;
947         }
948
949         ts->branch_count += 1;
950         ts->insn_count += sample->insn_cnt;
951         ts->cyc_count += sample->cyc_cnt;
952         ts->last_time = sample->time;
953
954         if (sample->flags & PERF_IP_FLAG_CALL) {
955                 bool trace_end = sample->flags & PERF_IP_FLAG_TRACE_END;
956                 struct call_path_root *cpr = ts->crp->cpr;
957                 struct call_path *cp;
958                 u64 ret_addr;
959
960                 if (!sample->ip || !sample->addr)
961                         return 0;
962
963                 ret_addr = sample->ip + sample->insn_len;
964                 if (ret_addr == sample->addr)
965                         return 0; /* Zero-length calls are excluded */
966
967                 cp = call_path__findnew(cpr, ts->stack[ts->cnt - 1].cp,
968                                         to_al->sym, sample->addr,
969                                         ts->kernel_start);
970                 err = thread_stack__push_cp(ts, ret_addr, sample->time, ref,
971                                             cp, false, trace_end);
972
973                 /*
974                  * A call to the same symbol but not the start of the symbol,
975                  * may be the start of a x86 retpoline.
976                  */
977                 if (!err && rstate == X86_RETPOLINE_POSSIBLE && to_al->sym &&
978                     from_al->sym == to_al->sym &&
979                     to_al->addr != to_al->sym->start)
980                         ts->rstate = X86_RETPOLINE_DETECTED;
981
982         } else if (sample->flags & PERF_IP_FLAG_RETURN) {
983                 if (!sample->addr) {
984                         u32 return_from_kernel = PERF_IP_FLAG_SYSCALLRET |
985                                                  PERF_IP_FLAG_INTERRUPT;
986
987                         if (!(sample->flags & return_from_kernel))
988                                 return 0;
989
990                         /* Pop kernel stack */
991                         return thread_stack__pop_ks(thread, ts, sample, ref);
992                 }
993
994                 if (!sample->ip)
995                         return 0;
996
997                 /* x86 retpoline 'return' doesn't match the stack */
998                 if (rstate == X86_RETPOLINE_DETECTED && ts->cnt > 2 &&
999                     ts->stack[ts->cnt - 1].ret_addr != sample->addr)
1000                         return thread_stack__x86_retpoline(ts, sample, to_al);
1001
1002                 err = thread_stack__pop_cp(thread, ts, sample->addr,
1003                                            sample->time, ref, from_al->sym);
1004                 if (err) {
1005                         if (err < 0)
1006                                 return err;
1007                         err = thread_stack__no_call_return(thread, ts, sample,
1008                                                            from_al, to_al, ref);
1009                 }
1010         } else if (sample->flags & PERF_IP_FLAG_TRACE_BEGIN) {
1011                 err = thread_stack__trace_begin(thread, ts, sample->time, ref);
1012         } else if (sample->flags & PERF_IP_FLAG_TRACE_END) {
1013                 err = thread_stack__trace_end(ts, sample, ref);
1014         } else if (sample->flags & PERF_IP_FLAG_BRANCH &&
1015                    from_al->sym != to_al->sym && to_al->sym &&
1016                    to_al->addr == to_al->sym->start) {
1017                 struct call_path_root *cpr = ts->crp->cpr;
1018                 struct call_path *cp;
1019
1020                 /*
1021                  * The compiler might optimize a call/ret combination by making
1022                  * it a jmp. Make that visible by recording on the stack a
1023                  * branch to the start of a different symbol. Note that this means
1024                  * when a ret pops the stack, all jmps must be popped off first.
1025                  */
1026                 cp = call_path__findnew(cpr, ts->stack[ts->cnt - 1].cp,
1027                                         to_al->sym, sample->addr,
1028                                         ts->kernel_start);
1029                 err = thread_stack__push_cp(ts, 0, sample->time, ref, cp, false,
1030                                             false);
1031                 if (!err)
1032                         ts->stack[ts->cnt - 1].non_call = true;
1033         }
1034
1035         return err;
1036 }
1037
1038 size_t thread_stack__depth(struct thread *thread, int cpu)
1039 {
1040         struct thread_stack *ts = thread__stack(thread, cpu);
1041
1042         if (!ts)
1043                 return 0;
1044         return ts->cnt;
1045 }