// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2019 Facebook */
#include <linux/hash.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/ftrace.h>
#include <linux/rbtree_latch.h>
#include <linux/perf_event.h>
#include <linux/btf.h>
#include <linux/rcupdate_trace.h>
#include <linux/rcupdate_wait.h>

/* dummy _ops. The verifier will operate on target program's ops. */
const struct bpf_verifier_ops bpf_extension_verifier_ops = {
};
const struct bpf_prog_ops bpf_extension_prog_ops = {
};

/* btf_vmlinux has ~22k attachable functions. 1k htab is enough. */
#define TRAMPOLINE_HASH_BITS 10
#define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS)

static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];

/* serializes access to trampoline_table */
static DEFINE_MUTEX(trampoline_mutex);

void *bpf_jit_alloc_exec_page(void)
{
	void *image;

	image = bpf_jit_alloc_exec(PAGE_SIZE);
	if (!image)
		return NULL;

	set_vm_flush_reset_perms(image);
	/* Keep image as writable. The alternative is to keep flipping ro/rw
	 * every time a new program is attached or detached.
	 */
	set_memory_x((long)image, 1);
	return image;
}

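/* Each trampoline image occupies a single executable page. It is also
 * published as a kernel symbol ("bpf_trampoline_<key>") below, so that
 * perf and kallsyms can attribute samples to trampoline code.
 */
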
void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym)
{
	ksym->start = (unsigned long) data;
	ksym->end = ksym->start + PAGE_SIZE;
	bpf_ksym_add(ksym);
	perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start,
			   PAGE_SIZE, false, ksym->name);
}

void bpf_image_ksym_del(struct bpf_ksym *ksym)
{
	bpf_ksym_del(ksym);
	perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start,
			   PAGE_SIZE, true, ksym->name);
}

static void bpf_trampoline_ksym_add(struct bpf_trampoline *tr)
{
	struct bpf_ksym *ksym = &tr->ksym;

	snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", tr->key);
	bpf_image_ksym_add(tr->image, ksym);
}

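/* Find the trampoline for @key or allocate a new one. The key identifies
 * the attach target (derived from the BTF id of the target function and,
 * for program-to-program attachment, the target prog id). Returns with a
 * reference held; release it with bpf_trampoline_put().
 */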
static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
{
	struct bpf_trampoline *tr;
	struct hlist_head *head;
	void *image;
	int i;

	mutex_lock(&trampoline_mutex);
	head = &trampoline_table[hash_64(key, TRAMPOLINE_HASH_BITS)];
	hlist_for_each_entry(tr, head, hlist) {
		if (tr->key == key) {
			refcount_inc(&tr->refcnt);
			goto out;
		}
	}
	tr = kzalloc(sizeof(*tr), GFP_KERNEL);
	if (!tr)
		goto out;

	/* is_root was checked earlier. No need for bpf_jit_charge_modmem() */
	image = bpf_jit_alloc_exec_page();
	if (!image) {
		kfree(tr);
		tr = NULL;
		goto out;
	}

	tr->key = key;
	INIT_HLIST_NODE(&tr->hlist);
	hlist_add_head(&tr->hlist, head);
	refcount_set(&tr->refcnt, 1);
	mutex_init(&tr->mutex);
	for (i = 0; i < BPF_TRAMP_MAX; i++)
		INIT_HLIST_HEAD(&tr->progs_hlist[i]);
	tr->image = image;
	INIT_LIST_HEAD_RCU(&tr->ksym.lnode);
	bpf_trampoline_ksym_add(tr);
out:
	mutex_unlock(&trampoline_mutex);
	return tr;
}

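/* Returns 1 if @ip is managed by ftrace (so the *_ftrace_direct() API must
 * be used to install the trampoline), 0 if the call site has to be patched
 * directly via bpf_arch_text_poke(), or a negative errno if ftrace reports
 * a location that does not match @ip exactly.
 */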
static int is_ftrace_location(void *ip)
{
	long addr;

	addr = ftrace_location((long)ip);
	if (!addr)
		return 0;
	if (WARN_ON_ONCE(addr != (long)ip))
		return -EFAULT;
	return 1;
}

static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
{
	void *ip = tr->func.addr;
	int ret;

	if (tr->func.ftrace_managed)
		ret = unregister_ftrace_direct((long)ip, (long)old_addr);
	else
		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
	return ret;
}

static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr)
{
	void *ip = tr->func.addr;
	int ret;

	if (tr->func.ftrace_managed)
		ret = modify_ftrace_direct((long)ip, (long)old_addr, (long)new_addr);
	else
		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr);
	return ret;
}

/* first time registering */
static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
{
	void *ip = tr->func.addr;
	int ret;

	ret = is_ftrace_location(ip);
	if (ret < 0)
		return ret;
	tr->func.ftrace_managed = ret;

	if (tr->func.ftrace_managed)
		ret = register_ftrace_direct((long)ip, (long)new_addr);
	else
		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
	return ret;
}

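/* Snapshot the programs currently linked to @tr into a kcalloc'ed array,
 * one bpf_tramp_progs entry per attach kind (fentry, modify_return, fexit),
 * in the form arch_prepare_bpf_trampoline() consumes. The caller frees it.
 */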
static struct bpf_tramp_progs *
bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total)
{
	const struct bpf_prog_aux *aux;
	struct bpf_tramp_progs *tprogs;
	struct bpf_prog **progs;
	int kind;

	*total = 0;
	tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL);
	if (!tprogs)
		return ERR_PTR(-ENOMEM);

	for (kind = 0; kind < BPF_TRAMP_MAX; kind++) {
		tprogs[kind].nr_progs = tr->progs_cnt[kind];
		*total += tr->progs_cnt[kind];
		progs = tprogs[kind].progs;

		hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist)
			*progs++ = aux->prog;
	}
	return tprogs;
}

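/* Re-generate the trampoline after the set of attached programs changed.
 * The image page is double-buffered: the parity of tr->selector picks the
 * live half, each update assembles the new code into the idle half and then
 * atomically re-points the fentry call, so the instructions under a
 * preempted task are never rewritten in place.
 */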
static int bpf_trampoline_update(struct bpf_trampoline *tr)
{
	void *old_image = tr->image + ((tr->selector + 1) & 1) * PAGE_SIZE/2;
	void *new_image = tr->image + (tr->selector & 1) * PAGE_SIZE/2;
	struct bpf_tramp_progs *tprogs;
	u32 flags = BPF_TRAMP_F_RESTORE_REGS;
	int err, total;

	tprogs = bpf_trampoline_get_progs(tr, &total);
	if (IS_ERR(tprogs))
		return PTR_ERR(tprogs);

	if (total == 0) {
		err = unregister_fentry(tr, old_image);
		tr->selector = 0;
		goto out;
	}

	if (tprogs[BPF_TRAMP_FEXIT].nr_progs ||
	    tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs)
		flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;

	/* Though the second half of trampoline page is unused a task could be
	 * preempted in the middle of the first half of trampoline and two
	 * updates to trampoline would change the code from underneath the
	 * preempted task. Hence wait for tasks to voluntarily schedule or go
	 * to userspace.
	 * The same trampoline can hold both sleepable and non-sleepable progs.
	 * synchronize_rcu_tasks_trace() is needed to make sure all sleepable
	 * programs finish executing.
	 * Wait for these two grace periods together.
	 */
	synchronize_rcu_mult(call_rcu_tasks, call_rcu_tasks_trace);

	err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2,
					  &tr->func.model, flags, tprogs,
					  tr->func.addr);
	if (err < 0)
		goto out;

	if (tr->selector)
		/* progs already running at this address */
		err = modify_fentry(tr, old_image, new_image);
	else
		/* first time registering */
		err = register_fentry(tr, new_image);
	if (err)
		goto out;
	tr->selector++;
out:
	kfree(tprogs);
	return err;
}

static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
{
	switch (prog->expected_attach_type) {
	case BPF_TRACE_FENTRY:
		return BPF_TRAMP_FENTRY;
	case BPF_MODIFY_RETURN:
		return BPF_TRAMP_MODIFY_RETURN;
	case BPF_TRACE_FEXIT:
		return BPF_TRAMP_FEXIT;
	case BPF_LSM_MAC:
		if (!prog->aux->attach_func_proto->type)
			/* The function returns void, we cannot modify its
			 * return value.
			 */
			return BPF_TRAMP_FEXIT;
		else
			return BPF_TRAMP_MODIFY_RETURN;
	default:
		return BPF_TRAMP_REPLACE;
	}
}

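/* Attach @prog to @tr. An extension (freplace) program is mutually
 * exclusive with fentry/fexit/modify_return programs on the same
 * trampoline, and at most BPF_MAX_TRAMP_PROGS fentry+fexit programs can be
 * attached at once.
 */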
int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr)
{
	enum bpf_tramp_prog_type kind;
	int err = 0;
	int cnt;

	kind = bpf_attach_type_to_tramp(prog);
	mutex_lock(&tr->mutex);
	if (tr->extension_prog) {
		/* cannot attach fentry/fexit if extension prog is attached.
		 * cannot overwrite extension prog either.
		 */
		err = -EBUSY;
		goto out;
	}
	cnt = tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT];
	if (kind == BPF_TRAMP_REPLACE) {
		/* Cannot attach extension if fentry/fexit are in use. */
		if (cnt) {
			err = -EBUSY;
			goto out;
		}
		tr->extension_prog = prog;
		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
					 prog->bpf_func);
		goto out;
	}
	if (cnt >= BPF_MAX_TRAMP_PROGS) {
		err = -E2BIG;
		goto out;
	}
	if (!hlist_unhashed(&prog->aux->tramp_hlist)) {
		/* prog already linked */
		err = -EBUSY;
		goto out;
	}
	hlist_add_head(&prog->aux->tramp_hlist, &tr->progs_hlist[kind]);
	tr->progs_cnt[kind]++;
	err = bpf_trampoline_update(tr);
	if (err) {
		hlist_del(&prog->aux->tramp_hlist);
		tr->progs_cnt[kind]--;
	}
out:
	mutex_unlock(&tr->mutex);
	return err;
}

/* bpf_trampoline_unlink_prog() should never fail. */
int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr)
{
	enum bpf_tramp_prog_type kind;
	int err;

	kind = bpf_attach_type_to_tramp(prog);
	mutex_lock(&tr->mutex);
	if (kind == BPF_TRAMP_REPLACE) {
		WARN_ON_ONCE(!tr->extension_prog);
		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP,
					 tr->extension_prog->bpf_func, NULL);
		tr->extension_prog = NULL;
		goto out;
	}
	hlist_del(&prog->aux->tramp_hlist);
	tr->progs_cnt[kind]--;
	err = bpf_trampoline_update(tr);
out:
	mutex_unlock(&tr->mutex);
	return err;
}

struct bpf_trampoline *bpf_trampoline_get(u64 key,
					  struct bpf_attach_target_info *tgt_info)
{
	struct bpf_trampoline *tr;

	tr = bpf_trampoline_lookup(key);
	if (!tr)
		return NULL;

	mutex_lock(&tr->mutex);
	if (tr->func.addr)
		goto out;

	memcpy(&tr->func.model, &tgt_info->fmodel, sizeof(tgt_info->fmodel));
	tr->func.addr = (void *)tgt_info->tgt_addr;
out:
	mutex_unlock(&tr->mutex);
	return tr;
}

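/* Rough usage sketch (simplified from the attach path; error handling and
 * the caller's link bookkeeping omitted):
 *
 *	tr = bpf_trampoline_get(key, &tgt_info);
 *	if (!tr)
 *		return -ENOMEM;
 *	err = bpf_trampoline_link_prog(prog, tr);
 *	...
 *	bpf_trampoline_unlink_prog(prog, tr);
 *	bpf_trampoline_put(tr);
 */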
void bpf_trampoline_put(struct bpf_trampoline *tr)
{
	if (!tr)
		return;
	mutex_lock(&trampoline_mutex);
	if (!refcount_dec_and_test(&tr->refcnt))
		goto out;
	WARN_ON_ONCE(mutex_is_locked(&tr->mutex));
	if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FENTRY])))
		goto out;
	if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT])))
		goto out;
	bpf_image_ksym_del(&tr->ksym);
	/* This code will be executed when all bpf progs (both sleepable and
	 * non-sleepable) went through
	 * bpf_prog_put()->call_rcu[_tasks_trace]()->bpf_prog_free_deferred().
	 * Hence no need for another synchronize_rcu_tasks_trace() here,
	 * but synchronize_rcu_tasks() is still needed, since trampoline
	 * may not have had any sleepable programs and we need to wait
	 * for tasks to get out of trampoline code before freeing it.
	 */
	synchronize_rcu_tasks();
	bpf_jit_free_exec(tr->image);
	hlist_del(&tr->hlist);
	kfree(tr);
out:
	mutex_unlock(&trampoline_mutex);
}

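/* Runtime statistics below are only collected while the bpf_stats_enabled
 * static key is on (toggled via the kernel.bpf_stats_enabled sysctl).
 * sched_clock() can legitimately return 0, so NO_START_TIME (1) is used to
 * mean "no timestamp was taken".
 */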
#define NO_START_TIME 1
static u64 notrace bpf_prog_start_time(void)
{
	u64 start = NO_START_TIME;

	if (static_branch_unlikely(&bpf_stats_enabled_key)) {
		start = sched_clock();
		if (unlikely(!start))
			start = NO_START_TIME;
	}
	return start;
}

static void notrace inc_misses_counter(struct bpf_prog *prog)
{
	struct bpf_prog_stats *stats;

	stats = this_cpu_ptr(prog->stats);
	u64_stats_update_begin(&stats->syncp);
	stats->misses++;
	u64_stats_update_end(&stats->syncp);
}

/* The logic is similar to BPF_PROG_RUN, but with an explicit
 * rcu_read_lock() and migrate_disable() which are required
 * for the trampoline. The macro is split into
 * call __bpf_prog_enter
 * call prog->bpf_func
 * call __bpf_prog_exit
 *
 * __bpf_prog_enter returns:
 * 0 - skip execution of the bpf prog
 * 1 - execute bpf prog
 * [2..MAX_U64] - execute bpf prog and record execution time.
 *     This is start time.
 */
u64 notrace __bpf_prog_enter(struct bpf_prog *prog)
	__acquires(RCU)
{
	rcu_read_lock();
	migrate_disable();
	if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
		inc_misses_counter(prog);
		return 0;
	}
	return bpf_prog_start_time();
}

static void notrace update_prog_stats(struct bpf_prog *prog,
				      u64 start)
{
	struct bpf_prog_stats *stats;

	if (static_branch_unlikely(&bpf_stats_enabled_key) &&
	    /* static_key could be enabled in __bpf_prog_enter*
	     * and disabled in __bpf_prog_exit*.
	     * And vice versa.
	     * Hence check that 'start' is valid.
	     */
	    start > NO_START_TIME) {
		stats = this_cpu_ptr(prog->stats);
		u64_stats_update_begin(&stats->syncp);
		stats->cnt++;
		stats->nsecs += sched_clock() - start;
		u64_stats_update_end(&stats->syncp);
	}
}

void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
	__releases(RCU)
{
	update_prog_stats(prog, start);
	__this_cpu_dec(*(prog->active));
	migrate_enable();
	rcu_read_unlock();
}

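/* Sleepable programs may fault and therefore cannot run under regular RCU.
 * The sleepable variants below use rcu_read_lock_trace() (tasks-trace RCU)
 * instead; recursion protection and stats accounting are the same as in
 * the non-sleepable variants above.
 */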
u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog)
{
	rcu_read_lock_trace();
	migrate_disable();
	might_fault();
	if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
		inc_misses_counter(prog);
		return 0;
	}
	return bpf_prog_start_time();
}

void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start)
{
	update_prog_stats(prog, start);
	__this_cpu_dec(*(prog->active));
	migrate_enable();
	rcu_read_unlock_trace();
}

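/* Fallback for architectures without trampoline support; JIT backends
 * (e.g. x86-64's bpf_jit_comp.c) override this weak definition.
 */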
int __weak
arch_prepare_bpf_trampoline(void *image, void *image_end,
			    const struct btf_func_model *m, u32 flags,
			    struct bpf_tramp_progs *tprogs,
			    void *orig_call)
{
	return -ENOTSUPP;
}

static int __init init_trampolines(void)
{
	int i;

	for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++)
		INIT_HLIST_HEAD(&trampoline_table[i]);
	return 0;
}
late_initcall(init_trampolines);