kernel/bpf/stackmap.c
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2016 Facebook
 */
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/kernel.h>
#include <linux/stacktrace.h>
#include <linux/perf_event.h>
#include <linux/elf.h>
#include <linux/pagemap.h>
#include <linux/irq_work.h>
#include <linux/btf_ids.h>
#include "percpu_freelist.h"

#define STACK_CREATE_FLAG_MASK                                  \
        (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY |        \
         BPF_F_STACK_BUILD_ID)

struct stack_map_bucket {
        struct pcpu_freelist_node fnode;
        u32 hash;
        u32 nr;
        u64 data[];
};

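/* A stack trace map is a hash-indexed array of pre-allocated buckets:
 * buckets[] is indexed by jhash(trace) & (n_buckets - 1), and unused
 * buckets live on a per-cpu freelist so that elements can be grabbed
 * and recycled without allocating, even in NMI context.
 */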
struct bpf_stack_map {
        struct bpf_map map;
        void *elems;
        struct pcpu_freelist freelist;
        u32 n_buckets;
        struct stack_map_bucket *buckets[];
};

/* irq_work to run up_read() for build_id lookup in nmi context */
struct stack_map_irq_work {
        struct irq_work irq_work;
        struct mm_struct *mm;
};

static void do_up_read(struct irq_work *entry)
{
        struct stack_map_irq_work *work;

        if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
                return;

        work = container_of(entry, struct stack_map_irq_work, irq_work);
        mmap_read_unlock_non_owner(work->mm);
}

static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work);

static inline bool stack_map_use_build_id(struct bpf_map *map)
{
        return (map->map_flags & BPF_F_STACK_BUILD_ID);
}

static inline int stack_map_data_size(struct bpf_map *map)
{
        return stack_map_use_build_id(map) ?
                sizeof(struct bpf_stack_build_id) : sizeof(u64);
}

static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
{
        u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
        int err;

        smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries,
                                         smap->map.numa_node);
        if (!smap->elems)
                return -ENOMEM;

        err = pcpu_freelist_init(&smap->freelist);
        if (err)
                goto free_elems;

        pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size,
                               smap->map.max_entries);
        return 0;

free_elems:
        bpf_map_area_free(smap->elems);
        return err;
}

/* Called from syscall */
static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
{
        u32 value_size = attr->value_size;
        struct bpf_stack_map *smap;
        struct bpf_map_memory mem;
        u64 cost, n_buckets;
        int err;

        if (!bpf_capable())
                return ERR_PTR(-EPERM);

        if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
                return ERR_PTR(-EINVAL);

        /* check sanity of attributes */
        if (attr->max_entries == 0 || attr->key_size != 4 ||
            value_size < 8 || value_size % 8)
                return ERR_PTR(-EINVAL);

        BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64));
        if (attr->map_flags & BPF_F_STACK_BUILD_ID) {
                if (value_size % sizeof(struct bpf_stack_build_id) ||
                    value_size / sizeof(struct bpf_stack_build_id)
                    > sysctl_perf_event_max_stack)
                        return ERR_PTR(-EINVAL);
        } else if (value_size / 8 > sysctl_perf_event_max_stack)
                return ERR_PTR(-EINVAL);

        /* hash table size must be a power of 2 */
        n_buckets = roundup_pow_of_two(attr->max_entries);

        cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
        cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
        err = bpf_map_charge_init(&mem, cost);
        if (err)
                return ERR_PTR(err);

        smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
        if (!smap) {
                bpf_map_charge_finish(&mem);
                return ERR_PTR(-ENOMEM);
        }

        bpf_map_init_from_attr(&smap->map, attr);
        smap->map.value_size = value_size;
        smap->n_buckets = n_buckets;

        err = get_callchain_buffers(sysctl_perf_event_max_stack);
        if (err)
                goto free_charge;

        err = prealloc_elems_and_freelist(smap);
        if (err)
                goto put_buffers;

        bpf_map_charge_move(&smap->map.memory, &mem);

        return &smap->map;

put_buffers:
        put_callchain_buffers();
free_charge:
        bpf_map_charge_finish(&mem);
        bpf_map_area_free(smap);
        return ERR_PTR(err);
}

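/*
 * Illustrative sketch (not part of this file): creating a stack trace
 * map from user space with the raw bpf(2) syscall.  The attribute
 * values follow the sanity checks in stack_map_alloc() above; the
 * constants are an example only, and 127 assumes the default
 * perf_event_max_stack.  Error handling is elided.
 *
 *	union bpf_attr attr = {
 *		.map_type    = BPF_MAP_TYPE_STACK_TRACE,
 *		.key_size    = 4,                       // stack id (u32)
 *		.value_size  = 127 * sizeof(__u64),     // multiple of 8
 *		.max_entries = 10000,
 *	};
 *	int map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
 */
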
#define BPF_BUILD_ID 3
/*
 * Parse build id from the note segment. This logic can be shared between
 * 32-bit and 64-bit systems, because Elf32_Nhdr and Elf64_Nhdr are
 * identical.
 */
static inline int stack_map_parse_build_id(void *page_addr,
                                           unsigned char *build_id,
                                           void *note_start,
                                           Elf32_Word note_size)
{
        Elf32_Word note_offs = 0, new_offs;

        /* check for overflow */
        if (note_start < page_addr || note_start + note_size < note_start)
                return -EINVAL;

        /* only supports notes that fit in the first page */
        if (note_start + note_size > page_addr + PAGE_SIZE)
                return -EINVAL;

        while (note_offs + sizeof(Elf32_Nhdr) < note_size) {
                Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs);

                if (nhdr->n_type == BPF_BUILD_ID &&
                    nhdr->n_namesz == sizeof("GNU") &&
                    nhdr->n_descsz > 0 &&
                    nhdr->n_descsz <= BPF_BUILD_ID_SIZE) {
                        memcpy(build_id,
                               note_start + note_offs +
                               ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
                               nhdr->n_descsz);
                        memset(build_id + nhdr->n_descsz, 0,
                               BPF_BUILD_ID_SIZE - nhdr->n_descsz);
                        return 0;
                }
                new_offs = note_offs + sizeof(Elf32_Nhdr) +
                        ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4);
                if (new_offs <= note_offs)  /* overflow */
                        break;
                note_offs = new_offs;
        }
        return -EINVAL;
}

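/*
 * For reference, the on-disk note layout matched by
 * stack_map_parse_build_id() above (name and desc are each padded to
 * 4-byte alignment):
 *
 *	Elf32_Nhdr { n_namesz, n_descsz, n_type }
 *	name: "GNU\0"		(n_namesz bytes)
 *	desc: build id bytes	(n_descsz bytes, <= BPF_BUILD_ID_SIZE)
 */
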
/* Parse build ID from 32-bit ELF */
static int stack_map_get_build_id_32(void *page_addr,
                                     unsigned char *build_id)
{
        Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr;
        Elf32_Phdr *phdr;
        int i;

        /* only supports phdr tables that fit in one page */
        if (ehdr->e_phnum >
            (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr))
                return -EINVAL;

        phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr));

        for (i = 0; i < ehdr->e_phnum; ++i)
                if (phdr[i].p_type == PT_NOTE)
                        return stack_map_parse_build_id(page_addr, build_id,
                                        page_addr + phdr[i].p_offset,
                                        phdr[i].p_filesz);
        return -EINVAL;
}

/* Parse build ID from 64-bit ELF */
static int stack_map_get_build_id_64(void *page_addr,
                                     unsigned char *build_id)
{
        Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr;
        Elf64_Phdr *phdr;
        int i;

        /* only supports phdr tables that fit in one page */
        if (ehdr->e_phnum >
            (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr))
                return -EINVAL;

        phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr));

        for (i = 0; i < ehdr->e_phnum; ++i)
                if (phdr[i].p_type == PT_NOTE)
                        return stack_map_parse_build_id(page_addr, build_id,
                                        page_addr + phdr[i].p_offset,
                                        phdr[i].p_filesz);
        return -EINVAL;
}

/* Parse build ID of ELF file mapped to vma */
static int stack_map_get_build_id(struct vm_area_struct *vma,
                                  unsigned char *build_id)
{
        Elf32_Ehdr *ehdr;
        struct page *page;
        void *page_addr;
        int ret;

        /* only works for page-backed storage */
        if (!vma->vm_file)
                return -EINVAL;

        page = find_get_page(vma->vm_file->f_mapping, 0);
        if (!page)
                return -EFAULT; /* page not mapped */

        ret = -EINVAL;
        page_addr = kmap_atomic(page);
        ehdr = (Elf32_Ehdr *)page_addr;

        /* compare ELF magic: 0x7f "ELF" */
        if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0)
                goto out;

        /* only supports executable files and shared object files */
        if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN)
                goto out;

        if (ehdr->e_ident[EI_CLASS] == ELFCLASS32)
                ret = stack_map_get_build_id_32(page_addr, build_id);
        else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64)
                ret = stack_map_get_build_id_64(page_addr, build_id);
out:
        kunmap_atomic(page_addr);
        put_page(page);
        return ret;
}

static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
                                          u64 *ips, u32 trace_nr, bool user)
{
        int i;
        struct vm_area_struct *vma;
        bool irq_work_busy = false;
        struct stack_map_irq_work *work = NULL;

        if (irqs_disabled()) {
                if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
                        work = this_cpu_ptr(&up_read_work);
                        if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) {
                                /* cannot queue more up_read, fallback */
                                irq_work_busy = true;
                        }
                } else {
                        /*
                         * PREEMPT_RT does not allow taking the mmap lock,
                         * even with trylock, in an interrupt-disabled
                         * context. Force the fallback code.
                         */
                        irq_work_busy = true;
                }
        }

        /*
         * We cannot do up_read() while irqs are disabled, because of the
         * risk of deadlocking on rq_lock. To do the build_id lookup with
         * irqs disabled, we need to run up_read() in irq_work. We use a
         * percpu variable for the irq_work. If the irq_work is already in
         * use by another lookup, we fall back to reporting ips.
         *
         * The same fallback is used for kernel stacks (!user) on a
         * stackmap with build_id.
         */
        if (!user || !current || !current->mm || irq_work_busy ||
            !mmap_read_trylock_non_owner(current->mm)) {
                /* cannot access current->mm, fall back to ips */
                for (i = 0; i < trace_nr; i++) {
                        id_offs[i].status = BPF_STACK_BUILD_ID_IP;
                        id_offs[i].ip = ips[i];
                        memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE);
                }
                return;
        }

        for (i = 0; i < trace_nr; i++) {
                vma = find_vma(current->mm, ips[i]);
                if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) {
                        /* per-entry fallback to ips */
                        id_offs[i].status = BPF_STACK_BUILD_ID_IP;
                        id_offs[i].ip = ips[i];
                        memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE);
                        continue;
                }
                /* offset of the ip within the mapped file */
                id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
                        - vma->vm_start;
                id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
        }

        if (!work) {
                mmap_read_unlock_non_owner(current->mm);
        } else {
                work->mm = current->mm;
                irq_work_queue(&work->irq_work);
        }
}

static struct perf_callchain_entry *
get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
{
#ifdef CONFIG_STACKTRACE
        struct perf_callchain_entry *entry;
        int rctx;

        entry = get_callchain_entry(&rctx);

        if (!entry)
                return NULL;

        entry->nr = init_nr +
                stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr),
                                     sysctl_perf_event_max_stack - init_nr, 0);

        /* stack_trace_save_tsk() works on an unsigned long array, while
         * perf_callchain_entry uses a u64 array. For 32-bit systems, it is
         * necessary to fix this mismatch.
         */
        if (__BITS_PER_LONG != 64) {
                unsigned long *from = (unsigned long *) entry->ip;
                u64 *to = entry->ip;
                int i;

                /* copy data from the end to avoid using an extra buffer */
                for (i = entry->nr - 1; i >= (int)init_nr; i--)
                        to[i] = (u64)(from[i]);
        }

        put_callchain_entry(rctx);

        return entry;
#else /* CONFIG_STACKTRACE */
        return NULL;
#endif
}

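/*
 * Hash the trace and store it in the bucket selected by the hash.
 * Returns the bucket id on success, -EEXIST if the bucket already holds
 * a different trace and BPF_F_REUSE_STACKID is not set, -ENOMEM if no
 * free bucket is available, or -EFAULT if the skip count exceeds the
 * usable trace.
 */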
static long __bpf_get_stackid(struct bpf_map *map,
                              struct perf_callchain_entry *trace, u64 flags)
{
        struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
        struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
        u32 max_depth = map->value_size / stack_map_data_size(map);
        /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
        u32 init_nr = sysctl_perf_event_max_stack - max_depth;
        u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
        u32 hash, id, trace_nr, trace_len;
        bool user = flags & BPF_F_USER_STACK;
        u64 *ips;
        bool hash_matches;

        /* get_perf_callchain() guarantees that trace->nr >= init_nr
         * and trace->nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
         */
        trace_nr = trace->nr - init_nr;

        if (trace_nr <= skip)
                /* skipping more than the usable stack trace */
                return -EFAULT;

        trace_nr -= skip;
        trace_len = trace_nr * sizeof(u64);
        ips = trace->ip + skip + init_nr;
        hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
        id = hash & (smap->n_buckets - 1);
        bucket = READ_ONCE(smap->buckets[id]);

        hash_matches = bucket && bucket->hash == hash;
        /* fast cmp */
        if (hash_matches && flags & BPF_F_FAST_STACK_CMP)
                return id;

        if (stack_map_use_build_id(map)) {
                /* for build_id+offset, pop a bucket before slow cmp */
                new_bucket = (struct stack_map_bucket *)
                        pcpu_freelist_pop(&smap->freelist);
                if (unlikely(!new_bucket))
                        return -ENOMEM;
                new_bucket->nr = trace_nr;
                stack_map_get_build_id_offset(
                        (struct bpf_stack_build_id *)new_bucket->data,
                        ips, trace_nr, user);
                trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
                if (hash_matches && bucket->nr == trace_nr &&
                    memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
                        pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
                        return id;
                }
                if (bucket && !(flags & BPF_F_REUSE_STACKID)) {
                        pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
                        return -EEXIST;
                }
        } else {
                if (hash_matches && bucket->nr == trace_nr &&
                    memcmp(bucket->data, ips, trace_len) == 0)
                        return id;
                if (bucket && !(flags & BPF_F_REUSE_STACKID))
                        return -EEXIST;

                new_bucket = (struct stack_map_bucket *)
                        pcpu_freelist_pop(&smap->freelist);
                if (unlikely(!new_bucket))
                        return -ENOMEM;
                memcpy(new_bucket->data, ips, trace_len);
        }

        new_bucket->hash = hash;
        new_bucket->nr = trace_nr;

        old_bucket = xchg(&smap->buckets[id], new_bucket);
        if (old_bucket)
                pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
        return id;
}

BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
           u64, flags)
{
        u32 max_depth = map->value_size / stack_map_data_size(map);
        /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
        u32 init_nr = sysctl_perf_event_max_stack - max_depth;
        bool user = flags & BPF_F_USER_STACK;
        struct perf_callchain_entry *trace;
        bool kernel = !user;

        if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
                               BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
                return -EINVAL;

        trace = get_perf_callchain(regs, init_nr, kernel, user,
                                   sysctl_perf_event_max_stack, false, false);

        if (unlikely(!trace))
                /* couldn't fetch the stack trace */
                return -EFAULT;

        return __bpf_get_stackid(map, trace, flags);
}

const struct bpf_func_proto bpf_get_stackid_proto = {
        .func           = bpf_get_stackid,
        .gpl_only       = true,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_CONST_MAP_PTR,
        .arg3_type      = ARG_ANYTHING,
};

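/*
 * Illustrative BPF program sketch (not part of this file): storing the
 * current user stack in a stack trace map.  "stackmap" is a hypothetical
 * map of type BPF_MAP_TYPE_STACK_TRACE, created as sketched earlier, and
 * the kprobe target is an example.
 *
 *	SEC("kprobe/some_function")
 *	int trace_stack(struct pt_regs *ctx)
 *	{
 *		long id = bpf_get_stackid(ctx, &stackmap, BPF_F_USER_STACK);
 *		if (id >= 0)
 *			bpf_printk("stack id %ld\n", id);
 *		return 0;
 *	}
 */
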
/*
 * Count the leading kernel entries in a callchain; user entries start
 * after the PERF_CONTEXT_USER marker.
 */
static __u64 count_kernel_ip(struct perf_callchain_entry *trace)
{
        __u64 nr_kernel = 0;

        while (nr_kernel < trace->nr) {
                if (trace->ip[nr_kernel] == PERF_CONTEXT_USER)
                        break;
                nr_kernel++;
        }
        return nr_kernel;
}

BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx,
           struct bpf_map *, map, u64, flags)
{
        struct perf_event *event = ctx->event;
        struct perf_callchain_entry *trace;
        bool kernel, user;
        __u64 nr_kernel;
        int ret;

        /* perf_sample_data doesn't have callchain, use bpf_get_stackid */
        if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
                return bpf_get_stackid((unsigned long)(ctx->regs),
                                       (unsigned long) map, flags, 0, 0);

        if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
                               BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
                return -EINVAL;

        user = flags & BPF_F_USER_STACK;
        kernel = !user;

        trace = ctx->data->callchain;
        if (unlikely(!trace))
                return -EFAULT;

        nr_kernel = count_kernel_ip(trace);

        if (kernel) {
                __u64 nr = trace->nr;

                trace->nr = nr_kernel;
                ret = __bpf_get_stackid(map, trace, flags);

                /* restore nr */
                trace->nr = nr;
        } else { /* user */
                u64 skip = flags & BPF_F_SKIP_FIELD_MASK;

                skip += nr_kernel;
                if (skip > BPF_F_SKIP_FIELD_MASK)
                        return -EFAULT;

                flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
                ret = __bpf_get_stackid(map, trace, flags);
        }
        return ret;
}

const struct bpf_func_proto bpf_get_stackid_proto_pe = {
        .func           = bpf_get_stackid_pe,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_CONST_MAP_PTR,
        .arg3_type      = ARG_ANYTHING,
};

static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
                            struct perf_callchain_entry *trace_in,
                            void *buf, u32 size, u64 flags)
{
        u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
        bool user_build_id = flags & BPF_F_USER_BUILD_ID;
        u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
        bool user = flags & BPF_F_USER_STACK;
        struct perf_callchain_entry *trace;
        bool kernel = !user;
        int err = -EINVAL;
        u64 *ips;

        if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
                               BPF_F_USER_BUILD_ID)))
                goto clear;
        if (kernel && user_build_id)
                goto clear;

        elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id)
                                            : sizeof(u64);
        if (unlikely(size % elem_size))
                goto clear;

        /* cannot get a valid user stack for a task without user_mode regs */
        if (task && user && !user_mode(regs))
                goto err_fault;

        num_elem = size / elem_size;
        if (sysctl_perf_event_max_stack < num_elem)
                init_nr = 0;
        else
                init_nr = sysctl_perf_event_max_stack - num_elem;

        if (trace_in)
                trace = trace_in;
        else if (kernel && task)
                trace = get_callchain_entry_for_task(task, init_nr);
        else
                trace = get_perf_callchain(regs, init_nr, kernel, user,
                                           sysctl_perf_event_max_stack,
                                           false, false);
        if (unlikely(!trace))
                goto err_fault;

        trace_nr = trace->nr - init_nr;
        if (trace_nr < skip)
                goto err_fault;

        trace_nr -= skip;
        trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
        copy_len = trace_nr * elem_size;
        ips = trace->ip + skip + init_nr;
        if (user && user_build_id)
                stack_map_get_build_id_offset(buf, ips, trace_nr, user);
        else
                memcpy(buf, ips, copy_len);

        if (size > copy_len)
                memset(buf + copy_len, 0, size - copy_len);
        return copy_len;

err_fault:
        err = -EFAULT;
clear:
        memset(buf, 0, size);
        return err;
}

BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
           u64, flags)
{
        return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
}

const struct bpf_func_proto bpf_get_stack_proto = {
        .func           = bpf_get_stack,
        .gpl_only       = true,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_PTR_TO_UNINIT_MEM,
        .arg3_type      = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type      = ARG_ANYTHING,
};

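/*
 * Illustrative BPF program sketch (not part of this file): copying raw
 * frames into a local buffer with bpf_get_stack().  The return value is
 * the number of bytes written, or a negative error.
 *
 *	__u64 frames[32];
 *	long n = bpf_get_stack(ctx, frames, sizeof(frames), BPF_F_USER_STACK);
 *	// on success, n / 8 user-space frames were written to frames[]
 */
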
BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
           u32, size, u64, flags)
{
        struct pt_regs *regs = task_pt_regs(task);

        return __bpf_get_stack(regs, task, NULL, buf, size, flags);
}

BTF_ID_LIST(bpf_get_task_stack_btf_ids)
BTF_ID(struct, task_struct)

const struct bpf_func_proto bpf_get_task_stack_proto = {
        .func           = bpf_get_task_stack,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_BTF_ID,
        .arg2_type      = ARG_PTR_TO_UNINIT_MEM,
        .arg3_type      = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type      = ARG_ANYTHING,
        .btf_id         = bpf_get_task_stack_btf_ids,
};

BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
           void *, buf, u32, size, u64, flags)
{
        struct pt_regs *regs = (struct pt_regs *)(ctx->regs);
        struct perf_event *event = ctx->event;
        struct perf_callchain_entry *trace;
        bool kernel, user;
        int err = -EINVAL;
        __u64 nr_kernel;

        if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
                return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);

        if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
                               BPF_F_USER_BUILD_ID)))
                goto clear;

        user = flags & BPF_F_USER_STACK;
        kernel = !user;

        err = -EFAULT;
        trace = ctx->data->callchain;
        if (unlikely(!trace))
                goto clear;

        nr_kernel = count_kernel_ip(trace);

        if (kernel) {
                __u64 nr = trace->nr;

                trace->nr = nr_kernel;
                err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);

                /* restore nr */
                trace->nr = nr;
        } else { /* user */
                u64 skip = flags & BPF_F_SKIP_FIELD_MASK;

                skip += nr_kernel;
                if (skip > BPF_F_SKIP_FIELD_MASK)
                        goto clear;

                flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
                err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);
        }
        return err;

clear:
        memset(buf, 0, size);
        return err;
}

const struct bpf_func_proto bpf_get_stack_proto_pe = {
        .func           = bpf_get_stack_pe,
        .gpl_only       = true,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_PTR_TO_UNINIT_MEM,
        .arg3_type      = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type      = ARG_ANYTHING,
};

/* Called from eBPF program */
static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
{
        return ERR_PTR(-EOPNOTSUPP);
}

/* Called from syscall */
int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
{
        struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
        struct stack_map_bucket *bucket, *old_bucket;
        u32 id = *(u32 *)key, trace_len;

        if (unlikely(id >= smap->n_buckets))
                return -ENOENT;

        /* take the bucket out of the table while copying, so that a
         * concurrent update cannot recycle it under us
         */
        bucket = xchg(&smap->buckets[id], NULL);
        if (!bucket)
                return -ENOENT;

        trace_len = bucket->nr * stack_map_data_size(map);
        memcpy(value, bucket->data, trace_len);
        memset(value + trace_len, 0, map->value_size - trace_len);

        old_bucket = xchg(&smap->buckets[id], bucket);
        if (old_bucket)
                pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
        return 0;
}

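/*
 * Illustrative user-space sketch (not part of this file): reading one
 * stack trace by id through the bpf(2) lookup path, which lands in
 * bpf_stackmap_copy() above.  "map_fd" is a hypothetical stack trace
 * map fd; libbpf's wrapper is used for brevity, and 127 matches the
 * value_size sketched earlier.
 *
 *	__u64 frames[127];
 *	__u32 stack_id = ...;	// e.g. returned by bpf_get_stackid()
 *	if (bpf_map_lookup_elem(map_fd, &stack_id, frames) == 0)
 *		// frames[] now holds the trace, zero-padded to value_size
 */
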
static int stack_map_get_next_key(struct bpf_map *map, void *key,
                                  void *next_key)
{
        struct bpf_stack_map *smap = container_of(map,
                                                  struct bpf_stack_map, map);
        u32 id;

        WARN_ON_ONCE(!rcu_read_lock_held());

        if (!key) {
                id = 0;
        } else {
                id = *(u32 *)key;
                if (id >= smap->n_buckets || !smap->buckets[id])
                        id = 0;
                else
                        id++;
        }

        while (id < smap->n_buckets && !smap->buckets[id])
                id++;

        if (id >= smap->n_buckets)
                return -ENOENT;

        *(u32 *)next_key = id;
        return 0;
}

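/*
 * Illustrative user-space sketch (not part of this file): walking all
 * occupied buckets with the get-next-key primitive above, via libbpf.
 * Passing a NULL key returns the first occupied id.
 *
 *	__u32 key, next;
 *	int err = bpf_map_get_next_key(map_fd, NULL, &next);
 *	while (!err) {
 *		key = next;
 *		// ... look up "key" here ...
 *		err = bpf_map_get_next_key(map_fd, &key, &next);
 *	}
 */
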
static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,
                                 u64 map_flags)
{
        return -EINVAL;
}

/* Called from syscall or from eBPF program */
static int stack_map_delete_elem(struct bpf_map *map, void *key)
{
        struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
        struct stack_map_bucket *old_bucket;
        u32 id = *(u32 *)key;

        if (unlikely(id >= smap->n_buckets))
                return -E2BIG;

        old_bucket = xchg(&smap->buckets[id], NULL);
        if (old_bucket) {
                pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
                return 0;
        } else {
                return -ENOENT;
        }
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void stack_map_free(struct bpf_map *map)
{
        struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);

        bpf_map_area_free(smap->elems);
        pcpu_freelist_destroy(&smap->freelist);
        bpf_map_area_free(smap);
        put_callchain_buffers();
}

static int stack_trace_map_btf_id;
const struct bpf_map_ops stack_trace_map_ops = {
        .map_alloc = stack_map_alloc,
        .map_free = stack_map_free,
        .map_get_next_key = stack_map_get_next_key,
        .map_lookup_elem = stack_map_lookup_elem,
        .map_update_elem = stack_map_update_elem,
        .map_delete_elem = stack_map_delete_elem,
        .map_check_btf = map_check_no_btf,
        .map_btf_name = "bpf_stack_map",
        .map_btf_id = &stack_trace_map_btf_id,
};

static int __init stack_map_init(void)
{
        int cpu;
        struct stack_map_irq_work *work;

        for_each_possible_cpu(cpu) {
                work = per_cpu_ptr(&up_read_work, cpu);
                init_irq_work(&work->irq_work, do_up_read);
        }
        return 0;
}
subsys_initcall(stack_map_init);