bpf, x86: Support private stack in jit
author Yonghong Song <yonghong.song@linux.dev>
Tue, 12 Nov 2024 16:39:22 +0000 (08:39 -0800)
committer Alexei Starovoitov <ast@kernel.org>
Wed, 13 Nov 2024 00:26:24 +0000 (16:26 -0800)
The private stack is allocated in bpf_int_jit_compile() with 8-byte
alignment. The allocation size includes the stack size determined by
the verifier plus additional guard space used to detect stack overflow
and underflow. See the illustration below:
  ---> memory address increasing
  [8 bytes to protect overflow] [normal stack] [8 bytes to protect underflow]
If an overflow or underflow is detected, kernel messages like the
following are emitted in dmesg:
  BPF private stack overflow/underflow detected for prog Fx
  BPF Private stack overflow/underflow detected for prog bpf_prog_a41699c234a1567a_subprog1x
These messages were generated after I intentionally modified the
jited code to cause overflows for some progs.
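
For example, with a verifier-determined stack size of 64 bytes, the
per-cpu allocation and the frame pointer placement work out as below
(a minimal C sketch of the arithmetic in this patch; the 64-byte
figure is hypothetical):

  int stack_depth = 64;                      /* from the verifier */
  int alloc_sz = round_up(stack_depth, 8) +  /* 64 bytes of stack */
                 2 * PRIV_STACK_GUARD_SZ;    /* + 16 guard bytes = 80 */
  /* The frame pointer sits past the overflow guard and the stack
   * area, i.e. at the underflow guard; the stack grows toward lower
   * addresses from there, so in-bounds accesses touch neither guard.
   */
  void __percpu *frame = priv_stack_ptr + PRIV_STACK_GUARD_SZ +
                         round_up(stack_depth, 8);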

In the jited prog, x86 register R9 (X86_REG_R9) replaces the BPF
frame register (BPF_REG_10). The private stack is used per subprog
per cpu. Because R9 is a caller-saved register in the x86-64 calling
convention, X86_REG_R9 is saved and restored around every func call
(not including tailcall) to preserve its value.
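
Conceptually, each helper call in the jited body is then bracketed
as below (a pseudo-assembly sketch; the actual bytes are emitted by
push_r9()/pop_r9() in the patch):

  push r9          /* save the private frame pointer; r9 is caller-saved */
  call <helper>    /* the callee is free to clobber r9 */
  pop  r9          /* restore the private frame pointer */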

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20241112163922.2224385-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
arch/x86/net/bpf_jit_comp.c
include/linux/bpf.h

arch/x86/net/bpf_jit_comp.c
index 3ff638c..8f896c3 100644
@@ -325,6 +325,22 @@ struct jit_context {
 /* Number of bytes that will be skipped on tailcall */
 #define X86_TAIL_CALL_OFFSET   (12 + ENDBR_INSN_SIZE)
 
+static void push_r9(u8 **pprog)
+{
+       u8 *prog = *pprog;
+
+       EMIT2(0x41, 0x51);   /* push r9 */
+       *pprog = prog;
+}
+
+static void pop_r9(u8 **pprog)
+{
+       u8 *prog = *pprog;
+
+       EMIT2(0x41, 0x59);   /* pop r9 */
+       *pprog = prog;
+}
+
 static void push_r12(u8 **pprog)
 {
        u8 *prog = *pprog;
@@ -1404,6 +1420,24 @@ static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
        *pprog = prog;
 }
 
+static void emit_priv_frame_ptr(u8 **pprog, void __percpu *priv_frame_ptr)
+{
+       u8 *prog = *pprog;
+
+       /* movabs r9, priv_frame_ptr */
+       emit_mov_imm64(&prog, X86_REG_R9, (__force long) priv_frame_ptr >> 32,
+                      (u32) (__force long) priv_frame_ptr);
+
+#ifdef CONFIG_SMP
+       /* add <r9>, gs:[<off>] */
+       EMIT2(0x65, 0x4c);
+       EMIT3(0x03, 0x0c, 0x25);
+       EMIT((u32)(unsigned long)&this_cpu_off, 4);
+#endif
+
+       *pprog = prog;
+}
+
 #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
 
 #define __LOAD_TCC_PTR(off)                    \
@@ -1412,6 +1446,10 @@ static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
 #define LOAD_TAIL_CALL_CNT_PTR(stack)                          \
        __LOAD_TCC_PTR(BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack))
 
+/* Memory size/value to protect private stack overflow/underflow */
+#define PRIV_STACK_GUARD_SZ    8
+#define PRIV_STACK_GUARD_VAL   0xEB9F12345678eb9fULL
+
 static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
                  int oldproglen, struct jit_context *ctx, bool jmp_padding)
 {
@@ -1421,7 +1459,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
        int insn_cnt = bpf_prog->len;
        bool seen_exit = false;
        u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
+       void __percpu *priv_frame_ptr = NULL;
        u64 arena_vm_start, user_vm_start;
+       void __percpu *priv_stack_ptr;
        int i, excnt = 0;
        int ilen, proglen = 0;
        u8 *prog = temp;
@@ -1429,6 +1469,11 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
        int err;
 
        stack_depth = bpf_prog->aux->stack_depth;
+       priv_stack_ptr = bpf_prog->aux->priv_stack_ptr;
+       if (priv_stack_ptr) {
+               priv_frame_ptr = priv_stack_ptr + PRIV_STACK_GUARD_SZ + round_up(stack_depth, 8);
+               stack_depth = 0;
+       }
 
        arena_vm_start = bpf_arena_get_kern_vm_start(bpf_prog->aux->arena);
        user_vm_start = bpf_arena_get_user_vm_start(bpf_prog->aux->arena);
@@ -1457,6 +1502,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
                emit_mov_imm64(&prog, X86_REG_R12,
                               arena_vm_start >> 32, (u32) arena_vm_start);
 
+       if (priv_frame_ptr)
+               emit_priv_frame_ptr(&prog, priv_frame_ptr);
+
        ilen = prog - temp;
        if (rw_image)
                memcpy(rw_image + proglen, temp, ilen);
@@ -1476,6 +1524,14 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
                u8 *func;
                int nops;
 
+               if (priv_frame_ptr) {
+                       if (src_reg == BPF_REG_FP)
+                               src_reg = X86_REG_R9;
+
+                       if (dst_reg == BPF_REG_FP)
+                               dst_reg = X86_REG_R9;
+               }
+
                switch (insn->code) {
                        /* ALU */
                case BPF_ALU | BPF_ADD | BPF_X:
@@ -2136,9 +2192,15 @@ populate_extable:
                        }
                        if (!imm32)
                                return -EINVAL;
+                       if (priv_frame_ptr) {
+                               push_r9(&prog);
+                               ip += 2;
+                       }
                        ip += x86_call_depth_emit_accounting(&prog, func, ip);
                        if (emit_call(&prog, func, ip))
                                return -EINVAL;
+                       if (priv_frame_ptr)
+                               pop_r9(&prog);
                        break;
                }
 
@@ -3306,6 +3368,42 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func
        return emit_bpf_dispatcher(&prog, 0, num_funcs - 1, funcs, image, buf);
 }
 
+static const char *bpf_get_prog_name(struct bpf_prog *prog)
+{
+       if (prog->aux->ksym.prog)
+               return prog->aux->ksym.name;
+       return prog->aux->name;
+}
+
+static void priv_stack_init_guard(void __percpu *priv_stack_ptr, int alloc_size)
+{
+       int cpu, underflow_idx = (alloc_size - PRIV_STACK_GUARD_SZ) >> 3;
+       u64 *stack_ptr;
+
+       for_each_possible_cpu(cpu) {
+               stack_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
+               stack_ptr[0] = PRIV_STACK_GUARD_VAL;
+               stack_ptr[underflow_idx] = PRIV_STACK_GUARD_VAL;
+       }
+}
+
+static void priv_stack_check_guard(void __percpu *priv_stack_ptr, int alloc_size,
+                                  struct bpf_prog *prog)
+{
+       int cpu, underflow_idx = (alloc_size - PRIV_STACK_GUARD_SZ) >> 3;
+       u64 *stack_ptr;
+
+       for_each_possible_cpu(cpu) {
+               stack_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
+               if (stack_ptr[0] != PRIV_STACK_GUARD_VAL ||
+                   stack_ptr[underflow_idx] != PRIV_STACK_GUARD_VAL) {
+                       pr_err("BPF private stack overflow/underflow detected for prog %sx\n",
+                              bpf_get_prog_name(prog));
+                       break;
+               }
+       }
+}
+
 struct x64_jit_data {
        struct bpf_binary_header *rw_header;
        struct bpf_binary_header *header;
@@ -3323,7 +3421,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
        struct bpf_binary_header *rw_header = NULL;
        struct bpf_binary_header *header = NULL;
        struct bpf_prog *tmp, *orig_prog = prog;
+       void __percpu *priv_stack_ptr = NULL;
        struct x64_jit_data *jit_data;
+       int priv_stack_alloc_sz;
        int proglen, oldproglen = 0;
        struct jit_context ctx = {};
        bool tmp_blinded = false;
@@ -3359,6 +3459,23 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
                }
                prog->aux->jit_data = jit_data;
        }
+       priv_stack_ptr = prog->aux->priv_stack_ptr;
+       if (!priv_stack_ptr && prog->aux->jits_use_priv_stack) {
+               /* Allocate actual private stack size with verifier-calculated
+                * stack size plus two memory guards to protect overflow and
+                * underflow.
+                */
+               priv_stack_alloc_sz = round_up(prog->aux->stack_depth, 8) +
+                                     2 * PRIV_STACK_GUARD_SZ;
+               priv_stack_ptr = __alloc_percpu_gfp(priv_stack_alloc_sz, 8, GFP_KERNEL);
+               if (!priv_stack_ptr) {
+                       prog = orig_prog;
+                       goto out_priv_stack;
+               }
+
+               priv_stack_init_guard(priv_stack_ptr, priv_stack_alloc_sz);
+               prog->aux->priv_stack_ptr = priv_stack_ptr;
+       }
        addrs = jit_data->addrs;
        if (addrs) {
                ctx = jit_data->ctx;
@@ -3494,6 +3611,11 @@ out_image:
                        bpf_prog_fill_jited_linfo(prog, addrs + 1);
 out_addrs:
                kvfree(addrs);
+               if (!image && priv_stack_ptr) {
+                       free_percpu(priv_stack_ptr);
+                       prog->aux->priv_stack_ptr = NULL;
+               }
+out_priv_stack:
                kfree(jit_data);
                prog->aux->jit_data = NULL;
        }
@@ -3532,6 +3654,8 @@ void bpf_jit_free(struct bpf_prog *prog)
        if (prog->jited) {
                struct x64_jit_data *jit_data = prog->aux->jit_data;
                struct bpf_binary_header *hdr;
+               void __percpu *priv_stack_ptr;
+               int priv_stack_alloc_sz;
 
                /*
                 * If we fail the final pass of JIT (from jit_subprogs),
@@ -3547,6 +3671,13 @@ void bpf_jit_free(struct bpf_prog *prog)
                prog->bpf_func = (void *)prog->bpf_func - cfi_get_offset();
                hdr = bpf_jit_binary_pack_hdr(prog);
                bpf_jit_binary_pack_free(hdr, NULL);
+               priv_stack_ptr = prog->aux->priv_stack_ptr;
+               if (priv_stack_ptr) {
+                       priv_stack_alloc_sz = round_up(prog->aux->stack_depth, 8) +
+                                             2 * PRIV_STACK_GUARD_SZ;
+                       priv_stack_check_guard(priv_stack_ptr, priv_stack_alloc_sz, prog);
+                       free_percpu(prog->aux->priv_stack_ptr);
+               }
                WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog));
        }
 
@@ -3562,6 +3693,11 @@ bool bpf_jit_supports_exceptions(void)
        return IS_ENABLED(CONFIG_UNWINDER_ORC);
 }
 
+bool bpf_jit_supports_private_stack(void)
+{
+       return true;
+}
+
 void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
 {
 #if defined(CONFIG_UNWINDER_ORC)
include/linux/bpf.h
index 129b29e..d32cc37 100644
@@ -1507,6 +1507,7 @@ struct bpf_prog_aux {
        u32 max_rdwr_access;
        struct btf *attach_btf;
        const struct bpf_ctx_arg_aux *ctx_arg_info;
+       void __percpu *priv_stack_ptr;
        struct mutex dst_mutex; /* protects dst_* pointers below, *after* prog becomes visible */
        struct bpf_prog *dst_prog;
        struct bpf_trampoline *dst_trampoline;