kernel/bpf/stackmap.c
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2016 Facebook
 */
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/kernel.h>
#include <linux/stacktrace.h>
#include <linux/perf_event.h>
#include <linux/elf.h>
#include <linux/pagemap.h>
#include <linux/irq_work.h>
#include <linux/btf_ids.h>
#include "percpu_freelist.h"

#define STACK_CREATE_FLAG_MASK                                  \
        (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY |        \
         BPF_F_STACK_BUILD_ID)

struct stack_map_bucket {
        struct pcpu_freelist_node fnode;
        u32 hash;
        u32 nr;
        u64 data[];
};

struct bpf_stack_map {
        struct bpf_map map;
        void *elems;
        struct pcpu_freelist freelist;
        u32 n_buckets;
        struct stack_map_bucket *buckets[];
};
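
/*
 * buckets[] is a power-of-two hash table indexed by the low bits of the
 * jhash of a captured stack trace; the bucket index doubles as the stack
 * id handed back to BPF programs. On a hash collision the old bucket is
 * kept and -EEXIST is returned, unless BPF_F_REUSE_STACKID is set, in
 * which case the colliding bucket is replaced.
 */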

/* irq_work to run up_read() for build_id lookup in NMI context */
struct stack_map_irq_work {
        struct irq_work irq_work;
        struct mm_struct *mm;
};

static void do_up_read(struct irq_work *entry)
{
        struct stack_map_irq_work *work;

        if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
                return;

        work = container_of(entry, struct stack_map_irq_work, irq_work);
        mmap_read_unlock_non_owner(work->mm);
}

static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work);
static inline bool stack_map_use_build_id(struct bpf_map *map)
{
        return (map->map_flags & BPF_F_STACK_BUILD_ID);
}

static inline int stack_map_data_size(struct bpf_map *map)
{
        return stack_map_use_build_id(map) ?
                sizeof(struct bpf_stack_build_id) : sizeof(u64);
}

static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
{
        u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
        int err;

        smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries,
                                         smap->map.numa_node);
        if (!smap->elems)
                return -ENOMEM;

        err = pcpu_freelist_init(&smap->freelist);
        if (err)
                goto free_elems;

        pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size,
                               smap->map.max_entries);
        return 0;

free_elems:
        bpf_map_area_free(smap->elems);
        return err;
}

/* Called from syscall */
static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
{
        u32 value_size = attr->value_size;
        struct bpf_stack_map *smap;
        u64 cost, n_buckets;
        int err;

        if (!bpf_capable())
                return ERR_PTR(-EPERM);

        if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
                return ERR_PTR(-EINVAL);

        /* check sanity of attributes */
        if (attr->max_entries == 0 || attr->key_size != 4 ||
            value_size < 8 || value_size % 8)
                return ERR_PTR(-EINVAL);

        BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64));
        if (attr->map_flags & BPF_F_STACK_BUILD_ID) {
                if (value_size % sizeof(struct bpf_stack_build_id) ||
                    value_size / sizeof(struct bpf_stack_build_id)
                    > sysctl_perf_event_max_stack)
                        return ERR_PTR(-EINVAL);
        } else if (value_size / 8 > sysctl_perf_event_max_stack)
                return ERR_PTR(-EINVAL);

        /* hash table size must be a power of 2 */
        n_buckets = roundup_pow_of_two(attr->max_entries);

        cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
        cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
        smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
        if (!smap)
                return ERR_PTR(-ENOMEM);

        bpf_map_init_from_attr(&smap->map, attr);
        smap->map.value_size = value_size;
        smap->n_buckets = n_buckets;

        err = get_callchain_buffers(sysctl_perf_event_max_stack);
        if (err)
                goto free_smap;

        err = prealloc_elems_and_freelist(smap);
        if (err)
                goto put_buffers;

        return &smap->map;

put_buffers:
        put_callchain_buffers();
free_smap:
        bpf_map_area_free(smap);
        return ERR_PTR(err);
}
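
/*
 * Example (userspace, illustrative sketch only): a stack trace map like
 * the one allocated above is typically created through libbpf. The sizes
 * below are arbitrary, but value_size must be a multiple of 8 (or of
 * sizeof(struct bpf_stack_build_id) when BPF_F_STACK_BUILD_ID is set),
 * as enforced by stack_map_alloc():
 *
 *	#include <bpf/bpf.h>
 *
 *	int map_fd = bpf_create_map(BPF_MAP_TYPE_STACK_TRACE,
 *				    sizeof(__u32),	 // key: stack id
 *				    127 * sizeof(__u64), // value: array of ips
 *				    10000,		 // max distinct stacks
 *				    0);			 // map_flags
 */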

#define BPF_BUILD_ID 3
/*
 * Parse build id from the note segment. This logic can be shared between
 * 32-bit and 64-bit systems, because Elf32_Nhdr and Elf64_Nhdr are
 * identical.
 */
static inline int stack_map_parse_build_id(void *page_addr,
                                           unsigned char *build_id,
                                           void *note_start,
                                           Elf32_Word note_size)
{
        Elf32_Word note_offs = 0, new_offs;

        /* check for overflow */
        if (note_start < page_addr || note_start + note_size < note_start)
                return -EINVAL;

        /* only supports notes that fit in the first page */
        if (note_start + note_size > page_addr + PAGE_SIZE)
                return -EINVAL;

        while (note_offs + sizeof(Elf32_Nhdr) < note_size) {
                Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs);

                if (nhdr->n_type == BPF_BUILD_ID &&
                    nhdr->n_namesz == sizeof("GNU") &&
                    nhdr->n_descsz > 0 &&
                    nhdr->n_descsz <= BPF_BUILD_ID_SIZE) {
                        memcpy(build_id,
                               note_start + note_offs +
                               ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
                               nhdr->n_descsz);
                        memset(build_id + nhdr->n_descsz, 0,
                               BPF_BUILD_ID_SIZE - nhdr->n_descsz);
                        return 0;
                }
                new_offs = note_offs + sizeof(Elf32_Nhdr) +
                        ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4);
                if (new_offs <= note_offs)  /* overflow */
                        break;
                note_offs = new_offs;
        }
        return -EINVAL;
}
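
/*
 * For reference, the note entry the loop above looks for has the standard
 * ELF layout (everything 4-byte aligned):
 *
 *	Elf32_Nhdr { n_namesz = 4, n_descsz = 20, n_type = 3 }  // NT_GNU_BUILD_ID
 *	"GNU\0"                 owner name, padded to a 4-byte boundary
 *	<20-byte build id>      descriptor (sha1 by default; descsz may vary)
 *
 * which is why the descriptor is copied from note_start + note_offs +
 * sizeof(Elf32_Nhdr) + ALIGN(sizeof("GNU"), 4).
 */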

/* Parse build ID from 32-bit ELF */
static int stack_map_get_build_id_32(void *page_addr,
                                     unsigned char *build_id)
{
        Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr;
        Elf32_Phdr *phdr;
        int i;

        /* only supports phdrs that fit in one page */
        if (ehdr->e_phnum >
            (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr))
                return -EINVAL;

        phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr));

        for (i = 0; i < ehdr->e_phnum; ++i) {
                if (phdr[i].p_type == PT_NOTE &&
                    !stack_map_parse_build_id(page_addr, build_id,
                                              page_addr + phdr[i].p_offset,
                                              phdr[i].p_filesz))
                        return 0;
        }
        return -EINVAL;
}

/* Parse build ID from 64-bit ELF */
static int stack_map_get_build_id_64(void *page_addr,
                                     unsigned char *build_id)
{
        Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr;
        Elf64_Phdr *phdr;
        int i;

        /* only supports phdrs that fit in one page */
        if (ehdr->e_phnum >
            (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr))
                return -EINVAL;

        phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr));

        for (i = 0; i < ehdr->e_phnum; ++i) {
                if (phdr[i].p_type == PT_NOTE &&
                    !stack_map_parse_build_id(page_addr, build_id,
                                              page_addr + phdr[i].p_offset,
                                              phdr[i].p_filesz))
                        return 0;
        }
        return -EINVAL;
}

/* Parse build ID of ELF file mapped to vma */
static int stack_map_get_build_id(struct vm_area_struct *vma,
                                  unsigned char *build_id)
{
        Elf32_Ehdr *ehdr;
        struct page *page;
        void *page_addr;
        int ret;

        /* only works for page-backed storage */
        if (!vma->vm_file)
                return -EINVAL;

        page = find_get_page(vma->vm_file->f_mapping, 0);
        if (!page)
                return -EFAULT; /* page not mapped */

        ret = -EINVAL;
        page_addr = kmap_atomic(page);
        ehdr = (Elf32_Ehdr *)page_addr;

        /* compare magic "\x7fELF" */
        if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0)
                goto out;

        /* only support executable files and shared object files */
        if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN)
                goto out;

        if (ehdr->e_ident[EI_CLASS] == ELFCLASS32)
                ret = stack_map_get_build_id_32(page_addr, build_id);
        else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64)
                ret = stack_map_get_build_id_64(page_addr, build_id);
out:
        kunmap_atomic(page_addr);
        put_page(page);
        return ret;
}

static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
                                          u64 *ips, u32 trace_nr, bool user)
{
        int i;
        struct vm_area_struct *vma;
        bool irq_work_busy = false;
        struct stack_map_irq_work *work = NULL;

        if (irqs_disabled()) {
                if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
                        work = this_cpu_ptr(&up_read_work);
                        if (irq_work_is_busy(&work->irq_work)) {
                                /* cannot queue more up_read, fallback */
                                irq_work_busy = true;
                        }
                } else {
                        /*
                         * PREEMPT_RT does not allow trylocking the mmap
                         * lock in interrupt-disabled context. Force the
                         * fallback code.
                         */
                        irq_work_busy = true;
                }
        }

        /*
         * We cannot do up_read() while irqs are disabled, because of the
         * risk of deadlocking on rq_lock. To do the build_id lookup with
         * irqs disabled, we need to run up_read() in irq_work. We use a
         * percpu variable for the irq_work. If the irq_work is already in
         * use by another lookup, we fall back to reporting only ips.
         *
         * The same fallback is used for kernel stacks (!user) on a
         * stackmap with build_id.
         */
        if (!user || !current || !current->mm || irq_work_busy ||
            !mmap_read_trylock_non_owner(current->mm)) {
                /* cannot access current->mm, fall back to ips */
                for (i = 0; i < trace_nr; i++) {
                        id_offs[i].status = BPF_STACK_BUILD_ID_IP;
                        id_offs[i].ip = ips[i];
                        memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE);
                }
                return;
        }

        for (i = 0; i < trace_nr; i++) {
                vma = find_vma(current->mm, ips[i]);
                if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) {
                        /* per-entry fallback to ips */
                        id_offs[i].status = BPF_STACK_BUILD_ID_IP;
                        id_offs[i].ip = ips[i];
                        memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE);
                        continue;
                }
                id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
                        - vma->vm_start;
                id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
        }

        if (!work) {
                mmap_read_unlock_non_owner(current->mm);
        } else {
                work->mm = current->mm;
                irq_work_queue(&work->irq_work);
        }
}

static struct perf_callchain_entry *
get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
{
#ifdef CONFIG_STACKTRACE
        struct perf_callchain_entry *entry;
        int rctx;

        entry = get_callchain_entry(&rctx);

        if (!entry)
                return NULL;

        entry->nr = init_nr +
                stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr),
                                     sysctl_perf_event_max_stack - init_nr, 0);

        /* stack_trace_save_tsk() works on an unsigned long array, while
         * perf_callchain_entry uses a u64 array. On 32-bit systems, it is
         * necessary to fix this mismatch.
         */
        if (__BITS_PER_LONG != 64) {
                unsigned long *from = (unsigned long *) entry->ip;
                u64 *to = entry->ip;
                int i;

                /* copy data from the end to avoid using an extra buffer */
                for (i = entry->nr - 1; i >= (int)init_nr; i--)
                        to[i] = (u64)(from[i]);
        }

        put_callchain_entry(rctx);

        return entry;
#else /* CONFIG_STACKTRACE */
        return NULL;
#endif
}

static long __bpf_get_stackid(struct bpf_map *map,
                              struct perf_callchain_entry *trace, u64 flags)
{
        struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
        struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
        u32 max_depth = map->value_size / stack_map_data_size(map);
        /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
        u32 init_nr = sysctl_perf_event_max_stack - max_depth;
        u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
        u32 hash, id, trace_nr, trace_len;
        bool user = flags & BPF_F_USER_STACK;
        u64 *ips;
        bool hash_matches;

        /* get_perf_callchain() guarantees that trace->nr >= init_nr
         * and trace->nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
         */
        trace_nr = trace->nr - init_nr;

        if (trace_nr <= skip)
                /* skipping more than usable stack trace */
                return -EFAULT;

        trace_nr -= skip;
        trace_len = trace_nr * sizeof(u64);
        ips = trace->ip + skip + init_nr;
        hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
        id = hash & (smap->n_buckets - 1);
        bucket = READ_ONCE(smap->buckets[id]);

        hash_matches = bucket && bucket->hash == hash;
        /* fast cmp */
        if (hash_matches && flags & BPF_F_FAST_STACK_CMP)
                return id;

        if (stack_map_use_build_id(map)) {
                /* for build_id+offset, pop a bucket before the slow cmp */
                new_bucket = (struct stack_map_bucket *)
                        pcpu_freelist_pop(&smap->freelist);
                if (unlikely(!new_bucket))
                        return -ENOMEM;
                new_bucket->nr = trace_nr;
                stack_map_get_build_id_offset(
                        (struct bpf_stack_build_id *)new_bucket->data,
                        ips, trace_nr, user);
                trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
                if (hash_matches && bucket->nr == trace_nr &&
                    memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
                        pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
                        return id;
                }
                if (bucket && !(flags & BPF_F_REUSE_STACKID)) {
                        pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
                        return -EEXIST;
                }
        } else {
                if (hash_matches && bucket->nr == trace_nr &&
                    memcmp(bucket->data, ips, trace_len) == 0)
                        return id;
                if (bucket && !(flags & BPF_F_REUSE_STACKID))
                        return -EEXIST;

                new_bucket = (struct stack_map_bucket *)
                        pcpu_freelist_pop(&smap->freelist);
                if (unlikely(!new_bucket))
                        return -ENOMEM;
                memcpy(new_bucket->data, ips, trace_len);
        }

        new_bucket->hash = hash;
        new_bucket->nr = trace_nr;

        old_bucket = xchg(&smap->buckets[id], new_bucket);
        if (old_bucket)
                pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
        return id;
}

BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
           u64, flags)
{
        u32 max_depth = map->value_size / stack_map_data_size(map);
        /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
        u32 init_nr = sysctl_perf_event_max_stack - max_depth;
        bool user = flags & BPF_F_USER_STACK;
        struct perf_callchain_entry *trace;
        bool kernel = !user;

        if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
                               BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
                return -EINVAL;

        trace = get_perf_callchain(regs, init_nr, kernel, user,
                                   sysctl_perf_event_max_stack, false, false);

        if (unlikely(!trace))
                /* couldn't fetch the stack trace */
                return -EFAULT;

        return __bpf_get_stackid(map, trace, flags);
}

const struct bpf_func_proto bpf_get_stackid_proto = {
        .func           = bpf_get_stackid,
        .gpl_only       = true,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_CONST_MAP_PTR,
        .arg3_type      = ARG_ANYTHING,
};
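
/*
 * Example (BPF program side, illustrative sketch only): given a
 * BPF_MAP_TYPE_STACK_TRACE map "stack_map" defined by the program, a
 * tracing program records the current stack with:
 *
 *	SEC("kprobe/try_to_wake_up")
 *	int count_stacks(struct pt_regs *ctx)
 *	{
 *		long id = bpf_get_stackid(ctx, &stack_map,
 *					  BPF_F_FAST_STACK_CMP);
 *
 *		return id < 0 ? 0 : do_count(id);  // do_count is hypothetical
 *	}
 *
 * On success the returned id is the key under which the trace can later
 * be read back from userspace with bpf_map_lookup_elem().
 */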

static __u64 count_kernel_ip(struct perf_callchain_entry *trace)
{
        __u64 nr_kernel = 0;

        while (nr_kernel < trace->nr) {
                if (trace->ip[nr_kernel] == PERF_CONTEXT_USER)
                        break;
                nr_kernel++;
        }
        return nr_kernel;
}

BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx,
           struct bpf_map *, map, u64, flags)
{
        struct perf_event *event = ctx->event;
        struct perf_callchain_entry *trace;
        bool kernel, user;
        __u64 nr_kernel;
        int ret;

        /* perf_sample_data doesn't have callchain, use bpf_get_stackid */
        if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
                return bpf_get_stackid((unsigned long)(ctx->regs),
                                       (unsigned long) map, flags, 0, 0);

        if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
                               BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
                return -EINVAL;

        user = flags & BPF_F_USER_STACK;
        kernel = !user;

        trace = ctx->data->callchain;
        if (unlikely(!trace))
                return -EFAULT;

        nr_kernel = count_kernel_ip(trace);

        if (kernel) {
                __u64 nr = trace->nr;

                trace->nr = nr_kernel;
                ret = __bpf_get_stackid(map, trace, flags);

                /* restore nr */
                trace->nr = nr;
        } else { /* user */
                u64 skip = flags & BPF_F_SKIP_FIELD_MASK;

                skip += nr_kernel;
                if (skip > BPF_F_SKIP_FIELD_MASK)
                        return -EFAULT;

                flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
                ret = __bpf_get_stackid(map, trace, flags);
        }
        return ret;
}

const struct bpf_func_proto bpf_get_stackid_proto_pe = {
        .func           = bpf_get_stackid_pe,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_CONST_MAP_PTR,
        .arg3_type      = ARG_ANYTHING,
};

static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
                            struct perf_callchain_entry *trace_in,
                            void *buf, u32 size, u64 flags)
{
        u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
        bool user_build_id = flags & BPF_F_USER_BUILD_ID;
        u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
        bool user = flags & BPF_F_USER_STACK;
        struct perf_callchain_entry *trace;
        bool kernel = !user;
        int err = -EINVAL;
        u64 *ips;

        if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
                               BPF_F_USER_BUILD_ID)))
                goto clear;
        if (kernel && user_build_id)
                goto clear;

        elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id)
                                            : sizeof(u64);
        if (unlikely(size % elem_size))
                goto clear;

        /* cannot get valid user stack for task without user_mode regs */
        if (task && user && !user_mode(regs))
                goto err_fault;

        num_elem = size / elem_size;
        if (sysctl_perf_event_max_stack < num_elem)
                init_nr = 0;
        else
                init_nr = sysctl_perf_event_max_stack - num_elem;

        if (trace_in)
                trace = trace_in;
        else if (kernel && task)
                trace = get_callchain_entry_for_task(task, init_nr);
        else
                trace = get_perf_callchain(regs, init_nr, kernel, user,
                                           sysctl_perf_event_max_stack,
                                           false, false);
        if (unlikely(!trace))
                goto err_fault;

        trace_nr = trace->nr - init_nr;
        if (trace_nr < skip)
                goto err_fault;

        trace_nr -= skip;
        trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
        copy_len = trace_nr * elem_size;
        ips = trace->ip + skip + init_nr;
        if (user && user_build_id)
                stack_map_get_build_id_offset(buf, ips, trace_nr, user);
        else
                memcpy(buf, ips, copy_len);

        if (size > copy_len)
                memset(buf + copy_len, 0, size - copy_len);
        return copy_len;

err_fault:
        err = -EFAULT;
clear:
        memset(buf, 0, size);
        return err;
}

BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
           u64, flags)
{
        return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
}

const struct bpf_func_proto bpf_get_stack_proto = {
        .func           = bpf_get_stack,
        .gpl_only       = true,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_PTR_TO_UNINIT_MEM,
        .arg3_type      = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type      = ARG_ANYTHING,
};
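
/*
 * Example (BPF program side, illustrative sketch only): unlike
 * bpf_get_stackid(), bpf_get_stack() copies the raw trace into a buffer
 * supplied by the program and returns the number of bytes written:
 *
 *	__u64 ips[32];	// hypothetical on-stack buffer, multiple of 8 bytes
 *	long len;
 *
 *	len = bpf_get_stack(ctx, ips, sizeof(ips), BPF_F_USER_STACK);
 *	if (len > 0)
 *		nr_frames = len / sizeof(__u64);  // nr_frames is hypothetical
 */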

BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
           u32, size, u64, flags)
{
        struct pt_regs *regs = task_pt_regs(task);

        return __bpf_get_stack(regs, task, NULL, buf, size, flags);
}

BTF_ID_LIST_SINGLE(bpf_get_task_stack_btf_ids, struct, task_struct)

const struct bpf_func_proto bpf_get_task_stack_proto = {
        .func           = bpf_get_task_stack,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_BTF_ID,
        .arg1_btf_id    = &bpf_get_task_stack_btf_ids[0],
        .arg2_type      = ARG_PTR_TO_UNINIT_MEM,
        .arg3_type      = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type      = ARG_ANYTHING,
};
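
/*
 * Example (BPF program side, illustrative sketch only): bpf_get_task_stack()
 * takes a BTF pointer to a task, so it is typically called from a task
 * iterator program rather than with the current task:
 *
 *	SEC("iter/task")
 *	int dump_task_stack(struct bpf_iter__task *ctx)
 *	{
 *		struct task_struct *task = ctx->task;
 *		__u64 ips[16];	// hypothetical buffer
 *		long len;
 *
 *		if (!task)
 *			return 0;
 *		len = bpf_get_task_stack(task, ips, sizeof(ips), 0);
 *		return 0;
 *	}
 */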

BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
           void *, buf, u32, size, u64, flags)
{
        struct pt_regs *regs = (struct pt_regs *)(ctx->regs);
        struct perf_event *event = ctx->event;
        struct perf_callchain_entry *trace;
        bool kernel, user;
        int err = -EINVAL;
        __u64 nr_kernel;

        if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
                return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);

        if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
                               BPF_F_USER_BUILD_ID)))
                goto clear;

        user = flags & BPF_F_USER_STACK;
        kernel = !user;

        err = -EFAULT;
        trace = ctx->data->callchain;
        if (unlikely(!trace))
                goto clear;

        nr_kernel = count_kernel_ip(trace);

        if (kernel) {
                __u64 nr = trace->nr;

                trace->nr = nr_kernel;
                err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);

                /* restore nr */
                trace->nr = nr;
        } else { /* user */
                u64 skip = flags & BPF_F_SKIP_FIELD_MASK;

                skip += nr_kernel;
                if (skip > BPF_F_SKIP_FIELD_MASK)
                        goto clear;

                flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
                err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);
        }
        return err;

clear:
        memset(buf, 0, size);
        return err;
}

const struct bpf_func_proto bpf_get_stack_proto_pe = {
        .func           = bpf_get_stack_pe,
        .gpl_only       = true,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_PTR_TO_UNINIT_MEM,
        .arg3_type      = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type      = ARG_ANYTHING,
};

/* Called from eBPF program */
static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
{
        return ERR_PTR(-EOPNOTSUPP);
}

/* Called from syscall */
int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
{
        struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
        struct stack_map_bucket *bucket, *old_bucket;
        u32 id = *(u32 *)key, trace_len;

        if (unlikely(id >= smap->n_buckets))
                return -ENOENT;

        bucket = xchg(&smap->buckets[id], NULL);
        if (!bucket)
                return -ENOENT;

        trace_len = bucket->nr * stack_map_data_size(map);
        memcpy(value, bucket->data, trace_len);
        memset(value + trace_len, 0, map->value_size - trace_len);

        old_bucket = xchg(&smap->buckets[id], bucket);
        if (old_bucket)
                pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
        return 0;
}

static int stack_map_get_next_key(struct bpf_map *map, void *key,
                                  void *next_key)
{
        struct bpf_stack_map *smap = container_of(map,
                                                  struct bpf_stack_map, map);
        u32 id;

        WARN_ON_ONCE(!rcu_read_lock_held());

        if (!key) {
                id = 0;
        } else {
                id = *(u32 *)key;
                if (id >= smap->n_buckets || !smap->buckets[id])
                        id = 0;
                else
                        id++;
        }

        while (id < smap->n_buckets && !smap->buckets[id])
                id++;

        if (id >= smap->n_buckets)
                return -ENOENT;

        *(u32 *)next_key = id;
        return 0;
}
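
/*
 * Example (userspace, illustrative sketch only): walking all collected
 * stacks with the two syscall wrappers that end up in
 * stack_map_get_next_key() and bpf_stackmap_copy() above:
 *
 *	__u32 key, next_key;
 *	__u64 ips[127];		// must match the map's value_size
 *	void *cur = NULL;	// NULL retrieves the first key
 *
 *	while (bpf_map_get_next_key(map_fd, cur, &next_key) == 0) {
 *		bpf_map_lookup_elem(map_fd, &next_key, ips);
 *		key = next_key;
 *		cur = &key;
 *	}
 */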

static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,
                                 u64 map_flags)
{
        return -EINVAL;
}

/* Called from syscall or from eBPF program */
static int stack_map_delete_elem(struct bpf_map *map, void *key)
{
        struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
        struct stack_map_bucket *old_bucket;
        u32 id = *(u32 *)key;

        if (unlikely(id >= smap->n_buckets))
                return -E2BIG;

        old_bucket = xchg(&smap->buckets[id], NULL);
        if (old_bucket) {
                pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
                return 0;
        } else {
                return -ENOENT;
        }
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void stack_map_free(struct bpf_map *map)
{
        struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);

        bpf_map_area_free(smap->elems);
        pcpu_freelist_destroy(&smap->freelist);
        bpf_map_area_free(smap);
        put_callchain_buffers();
}

static int stack_trace_map_btf_id;
const struct bpf_map_ops stack_trace_map_ops = {
        .map_meta_equal = bpf_map_meta_equal,
        .map_alloc = stack_map_alloc,
        .map_free = stack_map_free,
        .map_get_next_key = stack_map_get_next_key,
        .map_lookup_elem = stack_map_lookup_elem,
        .map_update_elem = stack_map_update_elem,
        .map_delete_elem = stack_map_delete_elem,
        .map_check_btf = map_check_no_btf,
        .map_btf_name = "bpf_stack_map",
        .map_btf_id = &stack_trace_map_btf_id,
};

static int __init stack_map_init(void)
{
        int cpu;
        struct stack_map_irq_work *work;

        for_each_possible_cpu(cpu) {
                work = per_cpu_ptr(&up_read_work, cpu);
                init_irq_work(&work->irq_work, do_up_read);
        }
        return 0;
}
subsys_initcall(stack_map_init);