2 * Just-In-Time compiler for eBPF filters on 32bit ARM
4 * Copyright (c) 2017 Shubham Bansal <illusionist.neo@gmail.com>
5 * Copyright (c) 2011 Mircea Gherzan <mgherzan@gmail.com>
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License as published by the
9 * Free Software Foundation; version 2 of the License.
12 #include <linux/bpf.h>
13 #include <linux/bitops.h>
14 #include <linux/compiler.h>
15 #include <linux/errno.h>
16 #include <linux/filter.h>
17 #include <linux/netdevice.h>
18 #include <linux/string.h>
19 #include <linux/slab.h>
20 #include <linux/if_vlan.h>
22 #include <asm/cacheflush.h>
23 #include <asm/hwcap.h>
24 #include <asm/opcodes.h>
26 #include "bpf_jit_32.h"
 * eBPF prog stack layout:
 *
 * original ARM_SP =>     +-----+
 *                        |     | callee saved registers
 *                        +-----+ <= (BPF_FP + SCRATCH_SIZE)
 *                        | ... | eBPF JIT scratch space
 * eBPF fp register =>    +-----+
 *   (BPF_FP)             | ... | eBPF prog stack
 *                        +-----+
 *                        |RSVD | JIT scratchpad
 * current ARM_SP =>      +-----+ <= (BPF_FP - STACK_SIZE + SCRATCH_SIZE)
 *                        | ... | Function call stack
 * The callee saved registers depend on whether frame pointers are enabled.
 * With frame pointers (to be compliant with the ABI):
 * original ARM_SP =>     +------------------+ \
 *                        |        pc        | |
 * current ARM_FP =>      +------------------+ } callee saved registers
 *                        |r4-r8,r10,fp,ip,lr| |
 *                        +------------------+ /
 * Without frame pointers:
 *
 * original ARM_SP =>     +------------------+
 *                        | r4-r8,r10,fp,lr  | callee saved registers
 * current ARM_FP =>      +------------------+
66 * When popping registers off the stack at the end of a BPF function, we
67 * reference them via the current ARM_FP register.
#define CALLEE_MASK	(1 << ARM_R4 | 1 << ARM_R5 | 1 << ARM_R6 | \
			 1 << ARM_R7 | 1 << ARM_R8 | 1 << ARM_R10 | \
			 1 << ARM_FP)
72 #define CALLEE_PUSH_MASK (CALLEE_MASK | 1 << ARM_LR)
73 #define CALLEE_POP_MASK (CALLEE_MASK | 1 << ARM_PC)
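/* Note (for clarity): LR is included in the push mask and PC in the pop mask,
 * so restoring the callee-saved set at the end of the JITed function also
 * performs the function return.
 */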
76 /* Stack layout - these are offsets from (top of stack - 4) */
/* Stack space for BPF_REG_2, BPF_REG_3, BPF_REG_4,
 * BPF_REG_5, BPF_REG_7, BPF_REG_8, BPF_REG_9,
 * BPF_REG_FP and the tail call count.
 */
101 BPF_JIT_SCRATCH_REGS,
 * Negative "register" values indicate that the register is stored on the
 * stack and give its offset from the top of the eBPF JIT scratch space.
 */
108 #define STACK_OFFSET(k) (-4 - (k) * 4)
109 #define SCRATCH_SIZE (BPF_JIT_SCRATCH_REGS * 4)
111 #define TMP_REG_1 (MAX_BPF_JIT_REG + 0) /* TEMP Register 1 */
112 #define TMP_REG_2 (MAX_BPF_JIT_REG + 1) /* TEMP Register 2 */
113 #define TCALL_CNT (MAX_BPF_JIT_REG + 2) /* Tail Call Count */
115 #define FLAG_IMM_OVERFLOW (1 << 0)
 * Map eBPF registers to ARM 32-bit registers or to stack scratch space.
 *
 * 1. The first argument is passed in ARM 32-bit registers and the rest of
 *    the arguments are passed on the stack scratch space.
 * 2. The first callee-saved register is mapped to ARM 32-bit registers and
 *    the rest are mapped to scratch space on the stack.
 * 3. We need two 64-bit temporary registers to do complex operations on
 *    eBPF registers.
 *
 * As the eBPF registers are all 64 bits wide and ARM has only 32-bit
 * registers, we map each eBPF register to a pair of ARM 32-bit registers
 * or to scratch memory space, and rebuild the 64-bit eBPF register from
 * those halves.
 */
132 static const s8 bpf2a32[][2] = {
133 /* return value from in-kernel function, and exit value from eBPF */
134 [BPF_REG_0] = {ARM_R1, ARM_R0},
135 /* arguments from eBPF program to in-kernel function */
136 [BPF_REG_1] = {ARM_R3, ARM_R2},
137 /* Stored on stack scratch space */
138 [BPF_REG_2] = {STACK_OFFSET(BPF_R2_HI), STACK_OFFSET(BPF_R2_LO)},
139 [BPF_REG_3] = {STACK_OFFSET(BPF_R3_HI), STACK_OFFSET(BPF_R3_LO)},
140 [BPF_REG_4] = {STACK_OFFSET(BPF_R4_HI), STACK_OFFSET(BPF_R4_LO)},
141 [BPF_REG_5] = {STACK_OFFSET(BPF_R5_HI), STACK_OFFSET(BPF_R5_LO)},
142 /* callee saved registers that in-kernel function will preserve */
143 [BPF_REG_6] = {ARM_R5, ARM_R4},
144 /* Stored on stack scratch space */
145 [BPF_REG_7] = {STACK_OFFSET(BPF_R7_HI), STACK_OFFSET(BPF_R7_LO)},
146 [BPF_REG_8] = {STACK_OFFSET(BPF_R8_HI), STACK_OFFSET(BPF_R8_LO)},
147 [BPF_REG_9] = {STACK_OFFSET(BPF_R9_HI), STACK_OFFSET(BPF_R9_LO)},
148 /* Read only Frame Pointer to access Stack */
149 [BPF_REG_FP] = {STACK_OFFSET(BPF_FP_HI), STACK_OFFSET(BPF_FP_LO)},
	/* Temporary registers for the internal BPF JIT; can be used
	 * for constant blinding and other operations.
	 */
153 [TMP_REG_1] = {ARM_R7, ARM_R6},
154 [TMP_REG_2] = {ARM_R10, ARM_R8},
155 /* Tail call count. Stored on stack scratch space. */
156 [TCALL_CNT] = {STACK_OFFSET(BPF_TC_HI), STACK_OFFSET(BPF_TC_LO)},
	/* Temporary register for blinding constants.
	 * Stored on stack scratch space.
	 */
160 [BPF_REG_AX] = {STACK_OFFSET(BPF_AX_HI), STACK_OFFSET(BPF_AX_LO)},
163 #define dst_lo dst[1]
164 #define dst_hi dst[0]
165 #define src_lo src[1]
166 #define src_hi src[0]
 * idx			: index of the current (last) JITed instruction.
 * prologue_bytes	: number of bytes used in the prologue.
 * epilogue_offset	: offset at which the epilogue starts.
 * offsets		: array of eBPF instruction offsets in the JITed code.
 * target		: final JITed code.
 * epilogue_bytes	: number of bytes used in the epilogue.
 * imm_count		: number of immediates used for global variables.
 * imms			: array of global variable addresses.
 */
185 const struct bpf_prog *prog;
187 unsigned int prologue_bytes;
188 unsigned int epilogue_offset;
193 #if __LINUX_ARM_ARCH__ < 7
 * Wrappers which handle both OABI and EABI and assure Thumb2 interworking
 * (where assembly routines like __aeabi_uidiv could cause problems).
 */
204 static u32 jit_udiv32(u32 dividend, u32 divisor)
206 return dividend / divisor;
209 static u32 jit_mod32(u32 dividend, u32 divisor)
211 return dividend % divisor;
214 static inline void _emit(int cond, u32 inst, struct jit_ctx *ctx)
216 inst |= (cond << 28);
217 inst = __opcode_to_mem_arm(inst);
219 if (ctx->target != NULL)
220 ctx->target[ctx->idx] = inst;
226 * Emit an instruction that will be executed unconditionally.
228 static inline void emit(u32 inst, struct jit_ctx *ctx)
230 _emit(ARM_COND_AL, inst, ctx);
 * Checks if an immediate value can be converted to an imm12 (12-bit) value,
 * i.e. an 8-bit constant rotated right by an even amount.
 */
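/*
 * Worked example (for illustration only): for x = 0xff000000 the loop below
 * finds rot = 4, since ror32(0xff, 8) == 0xff000000, and returns
 * rol32(0xff000000, 8) | (4 << 8) == 0x4ff, i.e. imm8 = 0xff with a rotation
 * field of 4.  A value such as 0x101 spans more than 8 bits and cannot be
 * encoded, so imm8m() reports failure and the caller falls back to
 * emit_mov_i_no8m().
 */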
236 static int16_t imm8m(u32 x)
240 for (rot = 0; rot < 16; rot++)
241 if ((x & ~ror32(0xff, 2 * rot)) == 0)
242 return rol32(x, 2 * rot) | (rot << 8);
246 static u32 arm_bpf_ldst_imm12(u32 op, u8 rt, u8 rn, s16 imm12)
248 op |= rt << 12 | rn << 16;
250 op |= ARM_INST_LDST__U;
253 return op | (imm12 & 0xfff);
256 static u32 arm_bpf_ldst_imm8(u32 op, u8 rt, u8 rn, s16 imm8)
258 op |= rt << 12 | rn << 16;
260 op |= ARM_INST_LDST__U;
263 return op | (imm8 & 0xf0) << 4 | (imm8 & 0x0f);
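/*
 * Note (for illustration): the halfword load/store encodings split the 8-bit
 * immediate into a high nibble at bits [11:8] and a low nibble at bits [3:0];
 * e.g. an offset of 0x34 is encoded as 0x3 in bits [11:8] and 0x4 in
 * bits [3:0].
 */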
266 #define ARM_LDR_I(rt, rn, off) arm_bpf_ldst_imm12(ARM_INST_LDR_I, rt, rn, off)
267 #define ARM_LDRB_I(rt, rn, off) arm_bpf_ldst_imm12(ARM_INST_LDRB_I, rt, rn, off)
268 #define ARM_LDRH_I(rt, rn, off) arm_bpf_ldst_imm8(ARM_INST_LDRH_I, rt, rn, off)
270 #define ARM_STR_I(rt, rn, off) arm_bpf_ldst_imm12(ARM_INST_STR_I, rt, rn, off)
271 #define ARM_STRB_I(rt, rn, off) arm_bpf_ldst_imm12(ARM_INST_STRB_I, rt, rn, off)
272 #define ARM_STRH_I(rt, rn, off) arm_bpf_ldst_imm8(ARM_INST_STRH_I, rt, rn, off)
275 * Initializes the JIT space with undefined instructions.
277 static void jit_fill_hole(void *area, unsigned int size)
280 /* We are guaranteed to have aligned memory. */
281 for (ptr = area; size >= sizeof(u32); size -= sizeof(u32))
282 *ptr++ = __opcode_to_mem_arm(ARM_INST_UDF);
285 #if defined(CONFIG_AEABI) && (__LINUX_ARM_ARCH__ >= 5)
286 /* EABI requires the stack to be aligned to 64-bit boundaries */
287 #define STACK_ALIGNMENT 8
289 /* Stack must be aligned to 32-bit boundaries */
290 #define STACK_ALIGNMENT 4
293 /* total stack size used in JITed code */
294 #define _STACK_SIZE (ctx->prog->aux->stack_depth + SCRATCH_SIZE)
295 #define STACK_SIZE ALIGN(_STACK_SIZE, STACK_ALIGNMENT)
/* Get the offset of an eBPF register stored in the scratch space. */
298 #define STACK_VAR(off) (STACK_SIZE + (off))
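/*
 * Note (for illustration): a stacked half-register k is addressed as
 * [ARM_SP, #STACK_VAR(STACK_OFFSET(k))], which expands to
 * ARM_SP + STACK_SIZE - 4 - k * 4.  ARM_SP + STACK_SIZE is the top of the
 * JITed stack frame, so the scratch slots sit directly below the
 * callee-saved register area, matching the stack layout diagram above.
 */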
300 #if __LINUX_ARM_ARCH__ < 7
302 static u16 imm_offset(u32 k, struct jit_ctx *ctx)
304 unsigned int i = 0, offset;
307 /* on the "fake" run we just count them (duplicates included) */
308 if (ctx->target == NULL) {
313 while ((i < ctx->imm_count) && ctx->imms[i]) {
314 if (ctx->imms[i] == k)
319 if (ctx->imms[i] == 0)
322 /* constants go just after the epilogue */
323 offset = ctx->offsets[ctx->prog->len - 1] * 4;
324 offset += ctx->prologue_bytes;
325 offset += ctx->epilogue_bytes;
328 ctx->target[offset / 4] = k;
330 /* PC in ARM mode == address of the instruction + 8 */
331 imm = offset - (8 + ctx->idx * 4);
		 * The literal pool is too far away; signal it via the
		 * flags.  Unfortunately we can only detect this on the
		 * second pass.
		 */
338 ctx->flags |= FLAG_IMM_OVERFLOW;
345 #endif /* __LINUX_ARM_ARCH__ */
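/*
 * Example with arbitrary, illustrative numbers: if a constant sits at byte
 * offset 0x200 in the image and the referencing load is the instruction at
 * index 0x40 (byte offset 0x100), the PC-relative displacement is
 * 0x200 - (0x100 + 8) = 0xf8, since the ARM-mode PC reads as the address of
 * the current instruction plus 8.
 */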
347 static inline int bpf2a32_offset(int bpf_to, int bpf_from,
348 const struct jit_ctx *ctx) {
351 if (ctx->target == NULL)
353 to = ctx->offsets[bpf_to];
354 from = ctx->offsets[bpf_from];
356 return to - from - 1;
360 * Move an immediate that's not an imm8m to a core register.
362 static inline void emit_mov_i_no8m(const u8 rd, u32 val, struct jit_ctx *ctx)
364 #if __LINUX_ARM_ARCH__ < 7
365 emit(ARM_LDR_I(rd, ARM_PC, imm_offset(val, ctx)), ctx);
367 emit(ARM_MOVW(rd, val & 0xffff), ctx);
369 emit(ARM_MOVT(rd, val >> 16), ctx);
373 static inline void emit_mov_i(const u8 rd, u32 val, struct jit_ctx *ctx)
375 int imm12 = imm8m(val);
378 emit(ARM_MOV_I(rd, imm12), ctx);
380 emit_mov_i_no8m(rd, val, ctx);
383 static void emit_bx_r(u8 tgt_reg, struct jit_ctx *ctx)
385 if (elf_hwcap & HWCAP_THUMB)
386 emit(ARM_BX(tgt_reg), ctx);
388 emit(ARM_MOV_R(ARM_PC, tgt_reg), ctx);
391 static inline void emit_blx_r(u8 tgt_reg, struct jit_ctx *ctx)
393 #if __LINUX_ARM_ARCH__ < 5
394 emit(ARM_MOV_R(ARM_LR, ARM_PC), ctx);
395 emit_bx_r(tgt_reg, ctx);
397 emit(ARM_BLX_R(tgt_reg), ctx);
401 static inline int epilogue_offset(const struct jit_ctx *ctx)
404 /* No need for 1st dummy run */
405 if (ctx->target == NULL)
407 to = ctx->epilogue_offset;
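	/* The "- 2" below accounts for the ARM branch encoding: the offset
	 * is relative to PC, which reads as the address of the branch
	 * instruction plus 8, i.e. two instructions ahead.
	 */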
410 return to - from - 2;
413 static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, u8 op)
415 const s8 *tmp = bpf2a32[TMP_REG_1];
417 #if __LINUX_ARM_ARCH__ == 7
418 if (elf_hwcap & HWCAP_IDIVA) {
420 emit(ARM_UDIV(rd, rm, rn), ctx);
422 emit(ARM_UDIV(ARM_IP, rm, rn), ctx);
423 emit(ARM_MLS(rd, rn, ARM_IP, rm), ctx);
	/*
	 * For BPF_ALU | BPF_DIV | BPF_K instructions,
	 * ARM_R1 and ARM_R0 contain the first argument of the BPF
	 * function, so we need to save them on the caller side to
	 * keep them from being clobbered within the callee.
	 * After the return from the callee, ARM_R0 and ARM_R1 are
	 * restored.
	 */
438 emit(ARM_MOV_R(tmp[0], ARM_R1), ctx);
439 emit(ARM_MOV_R(ARM_R1, rn), ctx);
442 emit(ARM_MOV_R(tmp[1], ARM_R0), ctx);
443 emit(ARM_MOV_R(ARM_R0, rm), ctx);
446 /* Call appropriate function */
447 emit_mov_i(ARM_IP, op == BPF_DIV ?
448 (u32)jit_udiv32 : (u32)jit_mod32, ctx);
449 emit_blx_r(ARM_IP, ctx);
451 /* Save return value */
453 emit(ARM_MOV_R(rd, ARM_R0), ctx);
455 /* Restore ARM_R0 and ARM_R1 */
457 emit(ARM_MOV_R(ARM_R1, tmp[0]), ctx);
459 emit(ARM_MOV_R(ARM_R0, tmp[1]), ctx);
462 /* Is the translated BPF register on stack? */
463 static bool is_stacked(s8 reg)
468 static inline void emit_a32_mov_i(const s8 dst, const u32 val,
471 const s8 *tmp = bpf2a32[TMP_REG_1];
473 if (is_stacked(dst)) {
474 emit_mov_i(tmp[1], val, ctx);
475 emit(ARM_STR_I(tmp[1], ARM_SP, STACK_VAR(dst)), ctx);
477 emit_mov_i(dst, val, ctx);
481 /* Sign extended move */
482 static inline void emit_a32_mov_i64(const bool is64, const s8 dst[],
483 const u32 val, struct jit_ctx *ctx) {
486 if (is64 && (val & (1<<31)))
488 emit_a32_mov_i(dst_lo, val, ctx);
489 emit_a32_mov_i(dst_hi, hi, ctx);
492 static inline void emit_a32_add_r(const u8 dst, const u8 src,
493 const bool is64, const bool hi,
494 struct jit_ctx *ctx) {
496 * adds dst_lo, dst_lo, src_lo
497 * adc dst_hi, dst_hi, src_hi
499 * add dst_lo, dst_lo, src_lo
502 emit(ARM_ADDS_R(dst, dst, src), ctx);
504 emit(ARM_ADC_R(dst, dst, src), ctx);
506 emit(ARM_ADD_R(dst, dst, src), ctx);
509 static inline void emit_a32_sub_r(const u8 dst, const u8 src,
510 const bool is64, const bool hi,
511 struct jit_ctx *ctx) {
513 * subs dst_lo, dst_lo, src_lo
514 * sbc dst_hi, dst_hi, src_hi
516 * sub dst_lo, dst_lo, src_lo
519 emit(ARM_SUBS_R(dst, dst, src), ctx);
521 emit(ARM_SBC_R(dst, dst, src), ctx);
523 emit(ARM_SUB_R(dst, dst, src), ctx);
526 static inline void emit_alu_r(const u8 dst, const u8 src, const bool is64,
527 const bool hi, const u8 op, struct jit_ctx *ctx){
528 switch (BPF_OP(op)) {
529 /* dst = dst + src */
531 emit_a32_add_r(dst, src, is64, hi, ctx);
533 /* dst = dst - src */
535 emit_a32_sub_r(dst, src, is64, hi, ctx);
537 /* dst = dst | src */
539 emit(ARM_ORR_R(dst, dst, src), ctx);
541 /* dst = dst & src */
543 emit(ARM_AND_R(dst, dst, src), ctx);
545 /* dst = dst ^ src */
547 emit(ARM_EOR_R(dst, dst, src), ctx);
549 /* dst = dst * src */
551 emit(ARM_MUL(dst, dst, src), ctx);
553 /* dst = dst << src */
555 emit(ARM_LSL_R(dst, dst, src), ctx);
557 /* dst = dst >> src */
559 emit(ARM_LSR_R(dst, dst, src), ctx);
561 /* dst = dst >> src (signed)*/
563 emit(ARM_MOV_SR(dst, dst, SRTYPE_ASR, src), ctx);
/* ALU operation (32 bit)
 * dst = dst (op) src
 */
571 static inline void emit_a32_alu_r(const s8 dst, const s8 src,
572 struct jit_ctx *ctx, const bool is64,
573 const bool hi, const u8 op) {
574 const s8 *tmp = bpf2a32[TMP_REG_1];
575 s8 rn = is_stacked(src) ? tmp[1] : src;
578 emit(ARM_LDR_I(rn, ARM_SP, STACK_VAR(src)), ctx);
581 if (is_stacked(dst)) {
582 emit(ARM_LDR_I(tmp[0], ARM_SP, STACK_VAR(dst)), ctx);
583 emit_alu_r(tmp[0], rn, is64, hi, op, ctx);
584 emit(ARM_STR_I(tmp[0], ARM_SP, STACK_VAR(dst)), ctx);
586 emit_alu_r(dst, rn, is64, hi, op, ctx);
590 /* ALU operation (64 bit) */
591 static inline void emit_a32_alu_r64(const bool is64, const s8 dst[],
592 const s8 src[], struct jit_ctx *ctx,
594 emit_a32_alu_r(dst_lo, src_lo, ctx, is64, false, op);
596 emit_a32_alu_r(dst_hi, src_hi, ctx, is64, true, op);
598 emit_a32_mov_i(dst_hi, 0, ctx);
/* dst = src (4 bytes) */
602 static inline void emit_a32_mov_r(const s8 dst, const s8 src,
603 struct jit_ctx *ctx) {
604 const s8 *tmp = bpf2a32[TMP_REG_1];
605 s8 rt = is_stacked(src) ? tmp[0] : src;
608 emit(ARM_LDR_I(tmp[0], ARM_SP, STACK_VAR(src)), ctx);
610 emit(ARM_STR_I(rt, ARM_SP, STACK_VAR(dst)), ctx);
612 emit(ARM_MOV_R(dst, rt), ctx);
616 static inline void emit_a32_mov_r64(const bool is64, const s8 dst[],
618 struct jit_ctx *ctx) {
619 emit_a32_mov_r(dst_lo, src_lo, ctx);
621 /* complete 8 byte move */
622 emit_a32_mov_r(dst_hi, src_hi, ctx);
624 /* Zero out high 4 bytes */
625 emit_a32_mov_i(dst_hi, 0, ctx);
629 /* Shift operations */
630 static inline void emit_a32_alu_i(const s8 dst, const u32 val,
631 struct jit_ctx *ctx, const u8 op) {
632 const s8 *tmp = bpf2a32[TMP_REG_1];
633 s8 rd = is_stacked(dst) ? tmp[0] : dst;
636 emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst)), ctx);
638 /* Do shift operation */
641 emit(ARM_LSL_I(rd, rd, val), ctx);
644 emit(ARM_LSR_I(rd, rd, val), ctx);
647 emit(ARM_RSB_I(rd, rd, val), ctx);
652 emit(ARM_STR_I(rd, ARM_SP, STACK_VAR(dst)), ctx);
/* dst = -dst (64 bit) */
656 static inline void emit_a32_neg64(const s8 dst[],
657 struct jit_ctx *ctx){
658 const s8 *tmp = bpf2a32[TMP_REG_1];
659 s8 rd = is_stacked(dst_lo) ? tmp[1] : dst[1];
660 s8 rm = is_stacked(dst_lo) ? tmp[0] : dst[0];
663 if (is_stacked(dst_lo)) {
664 emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx);
665 emit(ARM_LDR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx);
668 /* Do Negate Operation */
669 emit(ARM_RSBS_I(rd, rd, 0), ctx);
670 emit(ARM_RSC_I(rm, rm, 0), ctx);
672 if (is_stacked(dst_lo)) {
673 emit(ARM_STR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx);
674 emit(ARM_STR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx);
678 /* dst = dst << src */
679 static inline void emit_a32_lsh_r64(const s8 dst[], const s8 src[],
680 struct jit_ctx *ctx) {
681 const s8 *tmp = bpf2a32[TMP_REG_1];
682 const s8 *tmp2 = bpf2a32[TMP_REG_2];
685 s8 rt = is_stacked(src_lo) ? tmp2[1] : src_lo;
686 s8 rd = is_stacked(dst_lo) ? tmp[1] : dst_lo;
687 s8 rm = is_stacked(dst_lo) ? tmp[0] : dst_hi;
689 if (is_stacked(src_lo))
690 emit(ARM_LDR_I(rt, ARM_SP, STACK_VAR(src_lo)), ctx);
691 if (is_stacked(dst_lo)) {
692 emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx);
693 emit(ARM_LDR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx);
696 /* Do LSH operation */
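	/* Note on the sequence below: for a shift amount rt in [0, 63] the
	 * result is assembled from both halves.  ARM register-specified
	 * shifts use the bottom byte of the register and yield 0 for
	 * amounts >= 32, so the "rd << (rt - 32)" and "rd >> (32 - rt)"
	 * terms each contribute only where they are meaningful.
	 */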
697 emit(ARM_SUB_I(ARM_IP, rt, 32), ctx);
698 emit(ARM_RSB_I(tmp2[0], rt, 32), ctx);
699 emit(ARM_MOV_SR(ARM_LR, rm, SRTYPE_ASL, rt), ctx);
700 emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd, SRTYPE_ASL, ARM_IP), ctx);
701 emit(ARM_ORR_SR(ARM_IP, ARM_LR, rd, SRTYPE_LSR, tmp2[0]), ctx);
702 emit(ARM_MOV_SR(ARM_LR, rd, SRTYPE_ASL, rt), ctx);
704 if (is_stacked(dst_lo)) {
705 emit(ARM_STR_I(ARM_LR, ARM_SP, STACK_VAR(dst_lo)), ctx);
706 emit(ARM_STR_I(ARM_IP, ARM_SP, STACK_VAR(dst_hi)), ctx);
708 emit(ARM_MOV_R(rd, ARM_LR), ctx);
709 emit(ARM_MOV_R(rm, ARM_IP), ctx);
713 /* dst = dst >> src (signed)*/
714 static inline void emit_a32_arsh_r64(const s8 dst[], const s8 src[],
715 struct jit_ctx *ctx) {
716 const s8 *tmp = bpf2a32[TMP_REG_1];
717 const s8 *tmp2 = bpf2a32[TMP_REG_2];
719 s8 rt = is_stacked(src_lo) ? tmp2[1] : src_lo;
720 s8 rd = is_stacked(dst_lo) ? tmp[1] : dst_lo;
721 s8 rm = is_stacked(dst_lo) ? tmp[0] : dst_hi;
723 if (is_stacked(src_lo))
724 emit(ARM_LDR_I(rt, ARM_SP, STACK_VAR(src_lo)), ctx);
725 if (is_stacked(dst_lo)) {
726 emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx);
727 emit(ARM_LDR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx);
730 /* Do the ARSH operation */
731 emit(ARM_RSB_I(ARM_IP, rt, 32), ctx);
732 emit(ARM_SUBS_I(tmp2[0], rt, 32), ctx);
733 emit(ARM_MOV_SR(ARM_LR, rd, SRTYPE_LSR, rt), ctx);
734 emit(ARM_ORR_SR(ARM_LR, ARM_LR, rm, SRTYPE_ASL, ARM_IP), ctx);
735 _emit(ARM_COND_MI, ARM_B(0), ctx);
736 emit(ARM_ORR_SR(ARM_LR, ARM_LR, rm, SRTYPE_ASR, tmp2[0]), ctx);
737 emit(ARM_MOV_SR(ARM_IP, rm, SRTYPE_ASR, rt), ctx);
738 if (is_stacked(dst_lo)) {
739 emit(ARM_STR_I(ARM_LR, ARM_SP, STACK_VAR(dst_lo)), ctx);
740 emit(ARM_STR_I(ARM_IP, ARM_SP, STACK_VAR(dst_hi)), ctx);
742 emit(ARM_MOV_R(rd, ARM_LR), ctx);
743 emit(ARM_MOV_R(rm, ARM_IP), ctx);
747 /* dst = dst >> src */
748 static inline void emit_a32_rsh_r64(const s8 dst[], const s8 src[],
749 struct jit_ctx *ctx) {
750 const s8 *tmp = bpf2a32[TMP_REG_1];
751 const s8 *tmp2 = bpf2a32[TMP_REG_2];
753 s8 rt = is_stacked(src_lo) ? tmp2[1] : src_lo;
754 s8 rd = is_stacked(dst_lo) ? tmp[1] : dst_lo;
755 s8 rm = is_stacked(dst_lo) ? tmp[0] : dst_hi;
757 if (is_stacked(src_lo))
758 emit(ARM_LDR_I(rt, ARM_SP, STACK_VAR(src_lo)), ctx);
759 if (is_stacked(dst_lo)) {
760 emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx);
761 emit(ARM_LDR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx);
764 /* Do RSH operation */
765 emit(ARM_RSB_I(ARM_IP, rt, 32), ctx);
766 emit(ARM_SUBS_I(tmp2[0], rt, 32), ctx);
767 emit(ARM_MOV_SR(ARM_LR, rd, SRTYPE_LSR, rt), ctx);
768 emit(ARM_ORR_SR(ARM_LR, ARM_LR, rm, SRTYPE_ASL, ARM_IP), ctx);
769 emit(ARM_ORR_SR(ARM_LR, ARM_LR, rm, SRTYPE_LSR, tmp2[0]), ctx);
770 emit(ARM_MOV_SR(ARM_IP, rm, SRTYPE_LSR, rt), ctx);
771 if (is_stacked(dst_lo)) {
772 emit(ARM_STR_I(ARM_LR, ARM_SP, STACK_VAR(dst_lo)), ctx);
773 emit(ARM_STR_I(ARM_IP, ARM_SP, STACK_VAR(dst_hi)), ctx);
775 emit(ARM_MOV_R(rd, ARM_LR), ctx);
776 emit(ARM_MOV_R(rm, ARM_IP), ctx);
780 /* dst = dst << val */
781 static inline void emit_a32_lsh_i64(const s8 dst[],
782 const u32 val, struct jit_ctx *ctx){
783 const s8 *tmp = bpf2a32[TMP_REG_1];
784 const s8 *tmp2 = bpf2a32[TMP_REG_2];
786 s8 rd = is_stacked(dst_lo) ? tmp[1] : dst_lo;
787 s8 rm = is_stacked(dst_lo) ? tmp[0] : dst_hi;
789 if (is_stacked(dst_lo)) {
790 emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx);
791 emit(ARM_LDR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx);
794 /* Do LSH operation */
796 emit(ARM_MOV_SI(tmp2[0], rm, SRTYPE_ASL, val), ctx);
797 emit(ARM_ORR_SI(rm, tmp2[0], rd, SRTYPE_LSR, 32 - val), ctx);
798 emit(ARM_MOV_SI(rd, rd, SRTYPE_ASL, val), ctx);
801 emit(ARM_MOV_R(rm, rd), ctx);
803 emit(ARM_MOV_SI(rm, rd, SRTYPE_ASL, val - 32), ctx);
804 emit(ARM_EOR_R(rd, rd, rd), ctx);
807 if (is_stacked(dst_lo)) {
808 emit(ARM_STR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx);
809 emit(ARM_STR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx);
813 /* dst = dst >> val */
814 static inline void emit_a32_rsh_i64(const s8 dst[],
815 const u32 val, struct jit_ctx *ctx) {
816 const s8 *tmp = bpf2a32[TMP_REG_1];
817 const s8 *tmp2 = bpf2a32[TMP_REG_2];
819 s8 rd = is_stacked(dst_lo) ? tmp[1] : dst_lo;
820 s8 rm = is_stacked(dst_lo) ? tmp[0] : dst_hi;
822 if (is_stacked(dst_lo)) {
823 emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx);
824 emit(ARM_LDR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx);
827 /* Do LSR operation */
829 emit(ARM_MOV_SI(tmp2[1], rd, SRTYPE_LSR, val), ctx);
830 emit(ARM_ORR_SI(rd, tmp2[1], rm, SRTYPE_ASL, 32 - val), ctx);
831 emit(ARM_MOV_SI(rm, rm, SRTYPE_LSR, val), ctx);
832 } else if (val == 32) {
833 emit(ARM_MOV_R(rd, rm), ctx);
834 emit(ARM_MOV_I(rm, 0), ctx);
836 emit(ARM_MOV_SI(rd, rm, SRTYPE_LSR, val - 32), ctx);
837 emit(ARM_MOV_I(rm, 0), ctx);
840 if (is_stacked(dst_lo)) {
841 emit(ARM_STR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx);
842 emit(ARM_STR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx);
846 /* dst = dst >> val (signed) */
847 static inline void emit_a32_arsh_i64(const s8 dst[],
848 const u32 val, struct jit_ctx *ctx){
849 const s8 *tmp = bpf2a32[TMP_REG_1];
850 const s8 *tmp2 = bpf2a32[TMP_REG_2];
852 s8 rd = is_stacked(dst_lo) ? tmp[1] : dst_lo;
853 s8 rm = is_stacked(dst_lo) ? tmp[0] : dst_hi;
855 if (is_stacked(dst_lo)) {
856 emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx);
857 emit(ARM_LDR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx);
860 /* Do ARSH operation */
862 emit(ARM_MOV_SI(tmp2[1], rd, SRTYPE_LSR, val), ctx);
863 emit(ARM_ORR_SI(rd, tmp2[1], rm, SRTYPE_ASL, 32 - val), ctx);
864 emit(ARM_MOV_SI(rm, rm, SRTYPE_ASR, val), ctx);
865 } else if (val == 32) {
866 emit(ARM_MOV_R(rd, rm), ctx);
867 emit(ARM_MOV_SI(rm, rm, SRTYPE_ASR, 31), ctx);
869 emit(ARM_MOV_SI(rd, rm, SRTYPE_ASR, val - 32), ctx);
870 emit(ARM_MOV_SI(rm, rm, SRTYPE_ASR, 31), ctx);
873 if (is_stacked(dst_lo)) {
874 emit(ARM_STR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx);
875 emit(ARM_STR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx);
879 static inline void emit_a32_mul_r64(const s8 dst[], const s8 src[],
880 struct jit_ctx *ctx) {
881 const s8 *tmp = bpf2a32[TMP_REG_1];
882 const s8 *tmp2 = bpf2a32[TMP_REG_2];
883 /* Setup operands for multiplication */
884 s8 rd = is_stacked(dst_lo) ? tmp[1] : dst_lo;
885 s8 rm = is_stacked(dst_lo) ? tmp[0] : dst_hi;
886 s8 rt = is_stacked(src_lo) ? tmp2[1] : src_lo;
887 s8 rn = is_stacked(src_lo) ? tmp2[0] : src_hi;
889 if (is_stacked(dst_lo)) {
890 emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx);
891 emit(ARM_LDR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx);
893 if (is_stacked(src_lo)) {
894 emit(ARM_LDR_I(rt, ARM_SP, STACK_VAR(src_lo)), ctx);
895 emit(ARM_LDR_I(rn, ARM_SP, STACK_VAR(src_hi)), ctx);
898 /* Do Multiplication */
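	/* Note: this is the schoolbook decomposition
	 *   (hi1:lo1) * (hi2:lo2) mod 2^64 =
	 *     lo1 * lo2 + ((lo1 * hi2 + hi1 * lo2) << 32);
	 * UMULL produces the full 64-bit lo1 * lo2 product and the two
	 * 32-bit MULs supply the cross terms added into the high word.
	 */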
899 emit(ARM_MUL(ARM_IP, rd, rn), ctx);
900 emit(ARM_MUL(ARM_LR, rm, rt), ctx);
901 emit(ARM_ADD_R(ARM_LR, ARM_IP, ARM_LR), ctx);
903 emit(ARM_UMULL(ARM_IP, rm, rd, rt), ctx);
904 emit(ARM_ADD_R(rm, ARM_LR, rm), ctx);
905 if (is_stacked(dst_lo)) {
906 emit(ARM_STR_I(ARM_IP, ARM_SP, STACK_VAR(dst_lo)), ctx);
907 emit(ARM_STR_I(rm, ARM_SP, STACK_VAR(dst_hi)), ctx);
909 emit(ARM_MOV_R(rd, ARM_IP), ctx);
913 /* *(size *)(dst + off) = src */
914 static inline void emit_str_r(const s8 dst, const s8 src,
915 const s32 off, struct jit_ctx *ctx, const u8 sz){
916 const s8 *tmp = bpf2a32[TMP_REG_1];
917 s8 rd = is_stacked(dst) ? tmp[1] : dst;
920 emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst)), ctx);
922 emit_a32_mov_i(tmp[0], off, ctx);
923 emit(ARM_ADD_R(tmp[0], rd, tmp[0]), ctx);
929 emit(ARM_STR_I(src, rd, 0), ctx);
932 /* Store a HalfWord */
933 emit(ARM_STRH_I(src, rd, 0), ctx);
937 emit(ARM_STRB_I(src, rd, 0), ctx);
942 /* dst = *(size*)(src + off) */
943 static inline void emit_ldx_r(const s8 dst[], const s8 src,
944 s32 off, struct jit_ctx *ctx, const u8 sz){
945 const s8 *tmp = bpf2a32[TMP_REG_1];
946 const s8 *rd = is_stacked(dst_lo) ? tmp : dst;
955 if (off < 0 || off > off_max) {
956 emit_a32_mov_i(tmp[0], off, ctx);
957 emit(ARM_ADD_R(tmp[0], tmp[0], src), ctx);
960 } else if (rd[1] == rm) {
961 emit(ARM_MOV_R(tmp[0], rm), ctx);
967 emit(ARM_LDRB_I(rd[1], rm, off), ctx);
968 emit_a32_mov_i(dst[0], 0, ctx);
971 /* Load a HalfWord */
972 emit(ARM_LDRH_I(rd[1], rm, off), ctx);
973 emit_a32_mov_i(dst[0], 0, ctx);
977 emit(ARM_LDR_I(rd[1], rm, off), ctx);
978 emit_a32_mov_i(dst[0], 0, ctx);
981 /* Load a Double Word */
982 emit(ARM_LDR_I(rd[1], rm, off), ctx);
983 emit(ARM_LDR_I(rd[0], rm, off + 4), ctx);
986 if (is_stacked(dst_lo))
987 emit(ARM_STR_I(rd[1], ARM_SP, STACK_VAR(dst_lo)), ctx);
988 if (is_stacked(dst_lo) && sz == BPF_DW)
989 emit(ARM_STR_I(rd[0], ARM_SP, STACK_VAR(dst_hi)), ctx);
/* Arithmetic operation */
993 static inline void emit_ar_r(const u8 rd, const u8 rt, const u8 rm,
994 const u8 rn, struct jit_ctx *ctx, u8 op) {
997 emit(ARM_AND_R(ARM_IP, rt, rn), ctx);
998 emit(ARM_AND_R(ARM_LR, rd, rm), ctx);
999 emit(ARM_ORRS_R(ARM_IP, ARM_LR, ARM_IP), ctx);
1007 emit(ARM_CMP_R(rd, rm), ctx);
1008 _emit(ARM_COND_EQ, ARM_CMP_R(rt, rn), ctx);
1012 emit(ARM_CMP_R(rn, rt), ctx);
1013 emit(ARM_SBCS_R(ARM_IP, rm, rd), ctx);
1017 emit(ARM_CMP_R(rt, rn), ctx);
1018 emit(ARM_SBCS_R(ARM_IP, rd, rm), ctx);
1023 static int out_offset = -1; /* initialized on the first pass of build_body() */
1024 static int emit_bpf_tail_call(struct jit_ctx *ctx)
1027 /* bpf_tail_call(void *prog_ctx, struct bpf_array *array, u64 index) */
1028 const s8 *r2 = bpf2a32[BPF_REG_2];
1029 const s8 *r3 = bpf2a32[BPF_REG_3];
1030 const s8 *tmp = bpf2a32[TMP_REG_1];
1031 const s8 *tmp2 = bpf2a32[TMP_REG_2];
1032 const s8 *tcc = bpf2a32[TCALL_CNT];
1033 const int idx0 = ctx->idx;
1034 #define cur_offset (ctx->idx - idx0)
1035 #define jmp_offset (out_offset - (cur_offset) - 2)
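/* Note: jmp_offset is measured against out_offset, which is only recorded at
 * the end of this function on the first pass, so the emitted tail-call
 * sequence must have the same length on every pass; the check at the end
 * verifies this.
 */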
	/* if (index >= array->map.max_entries)
	 *	goto out;
	 */
1041 off = offsetof(struct bpf_array, map.max_entries);
1042 /* array->map.max_entries */
1043 emit_a32_mov_i(tmp[1], off, ctx);
1044 emit(ARM_LDR_I(tmp2[1], ARM_SP, STACK_VAR(r2[1])), ctx);
1045 emit(ARM_LDR_R(tmp[1], tmp2[1], tmp[1]), ctx);
1046 /* index is 32-bit for arrays */
1047 emit(ARM_LDR_I(tmp2[1], ARM_SP, STACK_VAR(r3[1])), ctx);
1048 /* index >= array->map.max_entries */
1049 emit(ARM_CMP_R(tmp2[1], tmp[1]), ctx);
1050 _emit(ARM_COND_CS, ARM_B(jmp_offset), ctx);
	/* if (tail_call_cnt > MAX_TAIL_CALL_CNT)
	 *	goto out;
	 * tail_call_cnt++;
	 */
1056 lo = (u32)MAX_TAIL_CALL_CNT;
1057 hi = (u32)((u64)MAX_TAIL_CALL_CNT >> 32);
1058 emit(ARM_LDR_I(tmp[1], ARM_SP, STACK_VAR(tcc[1])), ctx);
1059 emit(ARM_LDR_I(tmp[0], ARM_SP, STACK_VAR(tcc[0])), ctx);
1060 emit(ARM_CMP_I(tmp[0], hi), ctx);
1061 _emit(ARM_COND_EQ, ARM_CMP_I(tmp[1], lo), ctx);
1062 _emit(ARM_COND_HI, ARM_B(jmp_offset), ctx);
1063 emit(ARM_ADDS_I(tmp[1], tmp[1], 1), ctx);
1064 emit(ARM_ADC_I(tmp[0], tmp[0], 0), ctx);
1065 emit(ARM_STR_I(tmp[1], ARM_SP, STACK_VAR(tcc[1])), ctx);
1066 emit(ARM_STR_I(tmp[0], ARM_SP, STACK_VAR(tcc[0])), ctx);
	/* prog = array->ptrs[index]
	 * if (prog == NULL)
	 *	goto out;
	 */
1072 off = offsetof(struct bpf_array, ptrs);
1073 emit_a32_mov_i(tmp[1], off, ctx);
1074 emit(ARM_LDR_I(tmp2[1], ARM_SP, STACK_VAR(r2[1])), ctx);
1075 emit(ARM_ADD_R(tmp[1], tmp2[1], tmp[1]), ctx);
1076 emit(ARM_LDR_I(tmp2[1], ARM_SP, STACK_VAR(r3[1])), ctx);
1077 emit(ARM_MOV_SI(tmp[0], tmp2[1], SRTYPE_ASL, 2), ctx);
1078 emit(ARM_LDR_R(tmp[1], tmp[1], tmp[0]), ctx);
1079 emit(ARM_CMP_I(tmp[1], 0), ctx);
1080 _emit(ARM_COND_EQ, ARM_B(jmp_offset), ctx);
1082 /* goto *(prog->bpf_func + prologue_size); */
1083 off = offsetof(struct bpf_prog, bpf_func);
1084 emit_a32_mov_i(tmp2[1], off, ctx);
1085 emit(ARM_LDR_R(tmp[1], tmp[1], tmp2[1]), ctx);
1086 emit(ARM_ADD_I(tmp[1], tmp[1], ctx->prologue_bytes), ctx);
1087 emit_bx_r(tmp[1], ctx);
1090 if (out_offset == -1)
1091 out_offset = cur_offset;
1092 if (cur_offset != out_offset) {
1093 pr_err_once("tail_call out_offset = %d, expected %d!\n",
1094 cur_offset, out_offset);
1102 /* 0xabcd => 0xcdab */
1103 static inline void emit_rev16(const u8 rd, const u8 rn, struct jit_ctx *ctx)
1105 #if __LINUX_ARM_ARCH__ < 6
1106 const s8 *tmp2 = bpf2a32[TMP_REG_2];
1108 emit(ARM_AND_I(tmp2[1], rn, 0xff), ctx);
1109 emit(ARM_MOV_SI(tmp2[0], rn, SRTYPE_LSR, 8), ctx);
1110 emit(ARM_AND_I(tmp2[0], tmp2[0], 0xff), ctx);
1111 emit(ARM_ORR_SI(rd, tmp2[0], tmp2[1], SRTYPE_LSL, 8), ctx);
1113 emit(ARM_REV16(rd, rn), ctx);
1117 /* 0xabcdefgh => 0xghefcdab */
1118 static inline void emit_rev32(const u8 rd, const u8 rn, struct jit_ctx *ctx)
1120 #if __LINUX_ARM_ARCH__ < 6
1121 const s8 *tmp2 = bpf2a32[TMP_REG_2];
1123 emit(ARM_AND_I(tmp2[1], rn, 0xff), ctx);
1124 emit(ARM_MOV_SI(tmp2[0], rn, SRTYPE_LSR, 24), ctx);
1125 emit(ARM_ORR_SI(ARM_IP, tmp2[0], tmp2[1], SRTYPE_LSL, 24), ctx);
1127 emit(ARM_MOV_SI(tmp2[1], rn, SRTYPE_LSR, 8), ctx);
1128 emit(ARM_AND_I(tmp2[1], tmp2[1], 0xff), ctx);
1129 emit(ARM_MOV_SI(tmp2[0], rn, SRTYPE_LSR, 16), ctx);
1130 emit(ARM_AND_I(tmp2[0], tmp2[0], 0xff), ctx);
1131 emit(ARM_MOV_SI(tmp2[0], tmp2[0], SRTYPE_LSL, 8), ctx);
1132 emit(ARM_ORR_SI(tmp2[0], tmp2[0], tmp2[1], SRTYPE_LSL, 16), ctx);
1133 emit(ARM_ORR_R(rd, ARM_IP, tmp2[0]), ctx);
1136 emit(ARM_REV(rd, rn), ctx);
/* Push a stacked 64-bit register pair onto the stack */
1141 static inline void emit_push_r64(const s8 src[], const u8 shift,
1142 struct jit_ctx *ctx)
1144 const s8 *tmp2 = bpf2a32[TMP_REG_2];
1147 emit(ARM_LDR_I(tmp2[1], ARM_SP, STACK_VAR(src[1]+shift)), ctx);
1148 emit(ARM_LDR_I(tmp2[0], ARM_SP, STACK_VAR(src[0]+shift)), ctx);
1150 reg_set = (1 << tmp2[1]) | (1 << tmp2[0]);
1151 emit(ARM_PUSH(reg_set), ctx);
1154 static void build_prologue(struct jit_ctx *ctx)
1156 const s8 r0 = bpf2a32[BPF_REG_0][1];
1157 const s8 r2 = bpf2a32[BPF_REG_1][1];
1158 const s8 r3 = bpf2a32[BPF_REG_1][0];
1159 const s8 r4 = bpf2a32[BPF_REG_6][1];
1160 const s8 fplo = bpf2a32[BPF_REG_FP][1];
1161 const s8 fphi = bpf2a32[BPF_REG_FP][0];
1162 const s8 *tcc = bpf2a32[TCALL_CNT];
1164 /* Save callee saved registers. */
1165 #ifdef CONFIG_FRAME_POINTER
1166 u16 reg_set = CALLEE_PUSH_MASK | 1 << ARM_IP | 1 << ARM_PC;
1167 emit(ARM_MOV_R(ARM_IP, ARM_SP), ctx);
1168 emit(ARM_PUSH(reg_set), ctx);
1169 emit(ARM_SUB_I(ARM_FP, ARM_IP, 4), ctx);
1171 emit(ARM_PUSH(CALLEE_PUSH_MASK), ctx);
1172 emit(ARM_MOV_R(ARM_FP, ARM_SP), ctx);
1174 /* Save frame pointer for later */
1175 emit(ARM_SUB_I(ARM_IP, ARM_SP, SCRATCH_SIZE), ctx);
1177 ctx->stack_size = imm8m(STACK_SIZE);
1179 /* Set up function call stack */
1180 emit(ARM_SUB_I(ARM_SP, ARM_SP, ctx->stack_size), ctx);
1182 /* Set up BPF prog stack base register */
1183 emit_a32_mov_r(fplo, ARM_IP, ctx);
1184 emit_a32_mov_i(fphi, 0, ctx);
1187 emit(ARM_MOV_I(r4, 0), ctx);
1189 /* Move BPF_CTX to BPF_R1 */
1190 emit(ARM_MOV_R(r3, r4), ctx);
1191 emit(ARM_MOV_R(r2, r0), ctx);
1192 /* Initialize Tail Count */
1193 emit(ARM_STR_I(r4, ARM_SP, STACK_VAR(tcc[0])), ctx);
1194 emit(ARM_STR_I(r4, ARM_SP, STACK_VAR(tcc[1])), ctx);
1195 /* end of prologue */
1198 /* restore callee saved registers. */
1199 static void build_epilogue(struct jit_ctx *ctx)
1201 #ifdef CONFIG_FRAME_POINTER
	/* When using frame pointers, some additional registers need to
	 * be popped.
	 */
1204 u16 reg_set = CALLEE_POP_MASK | 1 << ARM_SP;
1205 emit(ARM_SUB_I(ARM_SP, ARM_FP, hweight16(reg_set) * 4), ctx);
1206 emit(ARM_LDM(ARM_SP, reg_set), ctx);
1208 /* Restore callee saved registers. */
1209 emit(ARM_MOV_R(ARM_SP, ARM_FP), ctx);
1210 emit(ARM_POP(CALLEE_POP_MASK), ctx);
 * Convert an eBPF instruction to a native instruction, i.e.
 * JIT an eBPF instruction.
 * Returns:
 *	 0 - Successfully JITed an 8-byte eBPF instruction
 *	>0 - Successfully JITed a 16-byte eBPF instruction
 *	<0 - Failed to JIT
 */
1222 static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
1224 const u8 code = insn->code;
1225 const s8 *dst = bpf2a32[insn->dst_reg];
1226 const s8 *src = bpf2a32[insn->src_reg];
1227 const s8 *tmp = bpf2a32[TMP_REG_1];
1228 const s8 *tmp2 = bpf2a32[TMP_REG_2];
1229 const s16 off = insn->off;
1230 const s32 imm = insn->imm;
1231 const int i = insn - ctx->prog->insnsi;
1232 const bool is64 = BPF_CLASS(code) == BPF_ALU64;
1236 #define check_imm(bits, imm) do { \
1237 if ((imm) >= (1 << ((bits) - 1)) || \
1238 (imm) < -(1 << ((bits) - 1))) { \
1239 pr_info("[%2d] imm=%d(0x%x) out of range\n", \
1244 #define check_imm24(imm) check_imm(24, imm)
1247 /* ALU operations */
1250 case BPF_ALU | BPF_MOV | BPF_K:
1251 case BPF_ALU | BPF_MOV | BPF_X:
1252 case BPF_ALU64 | BPF_MOV | BPF_K:
1253 case BPF_ALU64 | BPF_MOV | BPF_X:
1254 switch (BPF_SRC(code)) {
1256 emit_a32_mov_r64(is64, dst, src, ctx);
1259 /* Sign-extend immediate value to destination reg */
1260 emit_a32_mov_i64(is64, dst, imm, ctx);
1264 /* dst = dst + src/imm */
1265 /* dst = dst - src/imm */
1266 /* dst = dst | src/imm */
1267 /* dst = dst & src/imm */
1268 /* dst = dst ^ src/imm */
1269 /* dst = dst * src/imm */
1270 /* dst = dst << src */
1271 /* dst = dst >> src */
1272 case BPF_ALU | BPF_ADD | BPF_K:
1273 case BPF_ALU | BPF_ADD | BPF_X:
1274 case BPF_ALU | BPF_SUB | BPF_K:
1275 case BPF_ALU | BPF_SUB | BPF_X:
1276 case BPF_ALU | BPF_OR | BPF_K:
1277 case BPF_ALU | BPF_OR | BPF_X:
1278 case BPF_ALU | BPF_AND | BPF_K:
1279 case BPF_ALU | BPF_AND | BPF_X:
1280 case BPF_ALU | BPF_XOR | BPF_K:
1281 case BPF_ALU | BPF_XOR | BPF_X:
1282 case BPF_ALU | BPF_MUL | BPF_K:
1283 case BPF_ALU | BPF_MUL | BPF_X:
1284 case BPF_ALU | BPF_LSH | BPF_X:
1285 case BPF_ALU | BPF_RSH | BPF_X:
1286 case BPF_ALU | BPF_ARSH | BPF_K:
1287 case BPF_ALU | BPF_ARSH | BPF_X:
1288 case BPF_ALU64 | BPF_ADD | BPF_K:
1289 case BPF_ALU64 | BPF_ADD | BPF_X:
1290 case BPF_ALU64 | BPF_SUB | BPF_K:
1291 case BPF_ALU64 | BPF_SUB | BPF_X:
1292 case BPF_ALU64 | BPF_OR | BPF_K:
1293 case BPF_ALU64 | BPF_OR | BPF_X:
1294 case BPF_ALU64 | BPF_AND | BPF_K:
1295 case BPF_ALU64 | BPF_AND | BPF_X:
1296 case BPF_ALU64 | BPF_XOR | BPF_K:
1297 case BPF_ALU64 | BPF_XOR | BPF_X:
1298 switch (BPF_SRC(code)) {
1300 emit_a32_alu_r64(is64, dst, src, ctx, BPF_OP(code));
			/* Move the immediate value to the temporary
			 * register and do the ALU operation on that
			 * register: this sign-extends the immediate into
			 * the temporary register first, so it is safe to
			 * do the operation on it.
			 */
1309 emit_a32_mov_i64(is64, tmp2, imm, ctx);
1310 emit_a32_alu_r64(is64, dst, tmp2, ctx, BPF_OP(code));
1314 /* dst = dst / src(imm) */
1315 /* dst = dst % src(imm) */
1316 case BPF_ALU | BPF_DIV | BPF_K:
1317 case BPF_ALU | BPF_DIV | BPF_X:
1318 case BPF_ALU | BPF_MOD | BPF_K:
1319 case BPF_ALU | BPF_MOD | BPF_X:
1320 rd = is_stacked(dst_lo) ? tmp2[1] : dst_lo;
1321 if (is_stacked(dst_lo))
1322 emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx);
1323 switch (BPF_SRC(code)) {
		rt = is_stacked(src_lo) ? tmp2[0] : src_lo;
1326 if (is_stacked(src_lo))
1327 emit(ARM_LDR_I(rt, ARM_SP, STACK_VAR(src_lo)),
1332 emit_a32_mov_i(rt, imm, ctx);
1338 emit_udivmod(rd, rd, rt, ctx, BPF_OP(code));
1339 if (is_stacked(dst_lo))
1340 emit(ARM_STR_I(rd, ARM_SP, STACK_VAR(dst_lo)), ctx);
1341 emit_a32_mov_i(dst_hi, 0, ctx);
1343 case BPF_ALU64 | BPF_DIV | BPF_K:
1344 case BPF_ALU64 | BPF_DIV | BPF_X:
1345 case BPF_ALU64 | BPF_MOD | BPF_K:
1346 case BPF_ALU64 | BPF_MOD | BPF_X:
1348 /* dst = dst >> imm */
1349 /* dst = dst << imm */
1350 case BPF_ALU | BPF_RSH | BPF_K:
1351 case BPF_ALU | BPF_LSH | BPF_K:
1352 if (unlikely(imm > 31))
1355 emit_a32_alu_i(dst_lo, imm, ctx, BPF_OP(code));
1356 emit_a32_mov_i(dst_hi, 0, ctx);
1358 /* dst = dst << imm */
1359 case BPF_ALU64 | BPF_LSH | BPF_K:
1360 if (unlikely(imm > 63))
1362 emit_a32_lsh_i64(dst, imm, ctx);
1364 /* dst = dst >> imm */
1365 case BPF_ALU64 | BPF_RSH | BPF_K:
1366 if (unlikely(imm > 63))
1368 emit_a32_rsh_i64(dst, imm, ctx);
1370 /* dst = dst << src */
1371 case BPF_ALU64 | BPF_LSH | BPF_X:
1372 emit_a32_lsh_r64(dst, src, ctx);
1374 /* dst = dst >> src */
1375 case BPF_ALU64 | BPF_RSH | BPF_X:
1376 emit_a32_rsh_r64(dst, src, ctx);
1378 /* dst = dst >> src (signed) */
1379 case BPF_ALU64 | BPF_ARSH | BPF_X:
1380 emit_a32_arsh_r64(dst, src, ctx);
1382 /* dst = dst >> imm (signed) */
1383 case BPF_ALU64 | BPF_ARSH | BPF_K:
1384 if (unlikely(imm > 63))
1386 emit_a32_arsh_i64(dst, imm, ctx);
1389 case BPF_ALU | BPF_NEG:
1390 emit_a32_alu_i(dst_lo, 0, ctx, BPF_OP(code));
1391 emit_a32_mov_i(dst_hi, 0, ctx);
1393 /* dst = ~dst (64 bit) */
1394 case BPF_ALU64 | BPF_NEG:
1395 emit_a32_neg64(dst, ctx);
1397 /* dst = dst * src/imm */
1398 case BPF_ALU64 | BPF_MUL | BPF_X:
1399 case BPF_ALU64 | BPF_MUL | BPF_K:
1400 switch (BPF_SRC(code)) {
1402 emit_a32_mul_r64(dst, src, ctx);
			/* Move the immediate value to the temporary
			 * register and do the multiplication on it: this
			 * sign-extends the immediate into the temporary
			 * register first, so it is safe to do the
			 * operation on it.
			 */
1411 emit_a32_mov_i64(is64, tmp2, imm, ctx);
1412 emit_a32_mul_r64(dst, tmp2, ctx);
1416 /* dst = htole(dst) */
1417 /* dst = htobe(dst) */
1418 case BPF_ALU | BPF_END | BPF_FROM_LE:
1419 case BPF_ALU | BPF_END | BPF_FROM_BE:
1420 rd = is_stacked(dst_lo) ? tmp[0] : dst_hi;
1421 rt = is_stacked(dst_lo) ? tmp[1] : dst_lo;
1422 if (is_stacked(dst_lo)) {
1423 emit(ARM_LDR_I(rt, ARM_SP, STACK_VAR(dst_lo)), ctx);
1424 emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst_hi)), ctx);
1426 if (BPF_SRC(code) == BPF_FROM_LE)
1427 goto emit_bswap_uxt;
1430 emit_rev16(rt, rt, ctx);
1431 goto emit_bswap_uxt;
1433 emit_rev32(rt, rt, ctx);
1434 goto emit_bswap_uxt;
1436 emit_rev32(ARM_LR, rt, ctx);
1437 emit_rev32(rt, rd, ctx);
1438 emit(ARM_MOV_R(rd, ARM_LR), ctx);
1445 /* zero-extend 16 bits into 64 bits */
1446 #if __LINUX_ARM_ARCH__ < 6
1447 emit_a32_mov_i(tmp2[1], 0xffff, ctx);
1448 emit(ARM_AND_R(rt, rt, tmp2[1]), ctx);
1450 emit(ARM_UXTH(rt, rt), ctx);
1452 emit(ARM_EOR_R(rd, rd, rd), ctx);
1455 /* zero-extend 32 bits into 64 bits */
1456 emit(ARM_EOR_R(rd, rd, rd), ctx);
1463 if (is_stacked(dst_lo)) {
1464 emit(ARM_STR_I(rt, ARM_SP, STACK_VAR(dst_lo)), ctx);
1465 emit(ARM_STR_I(rd, ARM_SP, STACK_VAR(dst_hi)), ctx);
1469 case BPF_LD | BPF_IMM | BPF_DW:
1471 const struct bpf_insn insn1 = insn[1];
1475 emit_a32_mov_i(dst_lo, lo, ctx);
1476 emit_a32_mov_i(dst_hi, hi, ctx);
1480 /* LDX: dst = *(size *)(src + off) */
1481 case BPF_LDX | BPF_MEM | BPF_W:
1482 case BPF_LDX | BPF_MEM | BPF_H:
1483 case BPF_LDX | BPF_MEM | BPF_B:
1484 case BPF_LDX | BPF_MEM | BPF_DW:
1485 rn = is_stacked(src_lo) ? tmp2[1] : src_lo;
1486 if (is_stacked(src_lo))
1487 emit(ARM_LDR_I(rn, ARM_SP, STACK_VAR(src_lo)), ctx);
1488 emit_ldx_r(dst, rn, off, ctx, BPF_SIZE(code));
1490 /* ST: *(size *)(dst + off) = imm */
1491 case BPF_ST | BPF_MEM | BPF_W:
1492 case BPF_ST | BPF_MEM | BPF_H:
1493 case BPF_ST | BPF_MEM | BPF_B:
1494 case BPF_ST | BPF_MEM | BPF_DW:
1495 switch (BPF_SIZE(code)) {
1497 /* Sign-extend immediate value into temp reg */
1498 emit_a32_mov_i64(true, tmp2, imm, ctx);
1499 emit_str_r(dst_lo, tmp2[1], off, ctx, BPF_W);
1500 emit_str_r(dst_lo, tmp2[0], off+4, ctx, BPF_W);
1505 emit_a32_mov_i(tmp2[1], imm, ctx);
1506 emit_str_r(dst_lo, tmp2[1], off, ctx, BPF_SIZE(code));
1510 /* STX XADD: lock *(u32 *)(dst + off) += src */
1511 case BPF_STX | BPF_XADD | BPF_W:
1512 /* STX XADD: lock *(u64 *)(dst + off) += src */
1513 case BPF_STX | BPF_XADD | BPF_DW:
1515 /* STX: *(size *)(dst + off) = src */
1516 case BPF_STX | BPF_MEM | BPF_W:
1517 case BPF_STX | BPF_MEM | BPF_H:
1518 case BPF_STX | BPF_MEM | BPF_B:
1519 case BPF_STX | BPF_MEM | BPF_DW:
1521 u8 sz = BPF_SIZE(code);
1523 rn = is_stacked(src_lo) ? tmp2[1] : src_lo;
1524 rm = is_stacked(src_lo) ? tmp2[0] : src_hi;
1525 if (is_stacked(src_lo)) {
1526 emit(ARM_LDR_I(rn, ARM_SP, STACK_VAR(src_lo)), ctx);
1527 emit(ARM_LDR_I(rm, ARM_SP, STACK_VAR(src_hi)), ctx);
1530 /* Store the value */
1531 if (BPF_SIZE(code) == BPF_DW) {
1532 emit_str_r(dst_lo, rn, off, ctx, BPF_W);
1533 emit_str_r(dst_lo, rm, off+4, ctx, BPF_W);
1535 emit_str_r(dst_lo, rn, off, ctx, sz);
1539 /* PC += off if dst == src */
1540 /* PC += off if dst > src */
1541 /* PC += off if dst >= src */
1542 /* PC += off if dst < src */
1543 /* PC += off if dst <= src */
1544 /* PC += off if dst != src */
1545 /* PC += off if dst > src (signed) */
1546 /* PC += off if dst >= src (signed) */
1547 /* PC += off if dst < src (signed) */
1548 /* PC += off if dst <= src (signed) */
1549 /* PC += off if dst & src */
1550 case BPF_JMP | BPF_JEQ | BPF_X:
1551 case BPF_JMP | BPF_JGT | BPF_X:
1552 case BPF_JMP | BPF_JGE | BPF_X:
1553 case BPF_JMP | BPF_JNE | BPF_X:
1554 case BPF_JMP | BPF_JSGT | BPF_X:
1555 case BPF_JMP | BPF_JSGE | BPF_X:
1556 case BPF_JMP | BPF_JSET | BPF_X:
1557 case BPF_JMP | BPF_JLE | BPF_X:
1558 case BPF_JMP | BPF_JLT | BPF_X:
1559 case BPF_JMP | BPF_JSLT | BPF_X:
1560 case BPF_JMP | BPF_JSLE | BPF_X:
1561 /* Setup source registers */
1562 rm = is_stacked(src_lo) ? tmp2[0] : src_hi;
1563 rn = is_stacked(src_lo) ? tmp2[1] : src_lo;
1564 if (is_stacked(src_lo)) {
1565 emit(ARM_LDR_I(rn, ARM_SP, STACK_VAR(src_lo)), ctx);
1566 emit(ARM_LDR_I(rm, ARM_SP, STACK_VAR(src_hi)), ctx);
1569 /* PC += off if dst == imm */
1570 /* PC += off if dst > imm */
1571 /* PC += off if dst >= imm */
1572 /* PC += off if dst < imm */
1573 /* PC += off if dst <= imm */
1574 /* PC += off if dst != imm */
1575 /* PC += off if dst > imm (signed) */
1576 /* PC += off if dst >= imm (signed) */
1577 /* PC += off if dst < imm (signed) */
1578 /* PC += off if dst <= imm (signed) */
1579 /* PC += off if dst & imm */
1580 case BPF_JMP | BPF_JEQ | BPF_K:
1581 case BPF_JMP | BPF_JGT | BPF_K:
1582 case BPF_JMP | BPF_JGE | BPF_K:
1583 case BPF_JMP | BPF_JNE | BPF_K:
1584 case BPF_JMP | BPF_JSGT | BPF_K:
1585 case BPF_JMP | BPF_JSGE | BPF_K:
1586 case BPF_JMP | BPF_JSET | BPF_K:
1587 case BPF_JMP | BPF_JLT | BPF_K:
1588 case BPF_JMP | BPF_JLE | BPF_K:
1589 case BPF_JMP | BPF_JSLT | BPF_K:
1590 case BPF_JMP | BPF_JSLE | BPF_K:
1595 /* Sign-extend immediate value */
1596 emit_a32_mov_i64(true, tmp2, imm, ctx);
1598 /* Setup destination register */
1599 rd = is_stacked(dst_lo) ? tmp[0] : dst_hi;
1600 rt = is_stacked(dst_lo) ? tmp[1] : dst_lo;
1601 if (is_stacked(dst_lo)) {
1602 emit(ARM_LDR_I(rt, ARM_SP, STACK_VAR(dst_lo)), ctx);
1603 emit(ARM_LDR_I(rd, ARM_SP, STACK_VAR(dst_hi)), ctx);
1606 /* Check for the condition */
1607 emit_ar_r(rd, rt, rm, rn, ctx, BPF_OP(code));
1609 /* Setup JUMP instruction */
1610 jmp_offset = bpf2a32_offset(i+off, i, ctx);
1611 switch (BPF_OP(code)) {
1614 _emit(ARM_COND_NE, ARM_B(jmp_offset), ctx);
1617 _emit(ARM_COND_EQ, ARM_B(jmp_offset), ctx);
1620 _emit(ARM_COND_HI, ARM_B(jmp_offset), ctx);
1623 _emit(ARM_COND_CS, ARM_B(jmp_offset), ctx);
1626 _emit(ARM_COND_LT, ARM_B(jmp_offset), ctx);
1629 _emit(ARM_COND_GE, ARM_B(jmp_offset), ctx);
1632 _emit(ARM_COND_LS, ARM_B(jmp_offset), ctx);
1635 _emit(ARM_COND_CC, ARM_B(jmp_offset), ctx);
1638 _emit(ARM_COND_LT, ARM_B(jmp_offset), ctx);
1641 _emit(ARM_COND_GE, ARM_B(jmp_offset), ctx);
1646 case BPF_JMP | BPF_JA:
1650 jmp_offset = bpf2a32_offset(i+off, i, ctx);
1651 check_imm24(jmp_offset);
1652 emit(ARM_B(jmp_offset), ctx);
1656 case BPF_JMP | BPF_TAIL_CALL:
1657 if (emit_bpf_tail_call(ctx))
1661 case BPF_JMP | BPF_CALL:
1663 const s8 *r0 = bpf2a32[BPF_REG_0];
1664 const s8 *r1 = bpf2a32[BPF_REG_1];
1665 const s8 *r2 = bpf2a32[BPF_REG_2];
1666 const s8 *r3 = bpf2a32[BPF_REG_3];
1667 const s8 *r4 = bpf2a32[BPF_REG_4];
1668 const s8 *r5 = bpf2a32[BPF_REG_5];
1669 const u32 func = (u32)__bpf_call_base + (u32)imm;
1671 emit_a32_mov_r64(true, r0, r1, ctx);
1672 emit_a32_mov_r64(true, r1, r2, ctx);
1673 emit_push_r64(r5, 0, ctx);
1674 emit_push_r64(r4, 8, ctx);
1675 emit_push_r64(r3, 16, ctx);
1677 emit_a32_mov_i(tmp[1], func, ctx);
1678 emit_blx_r(tmp[1], ctx);
1680 emit(ARM_ADD_I(ARM_SP, ARM_SP, imm8m(24)), ctx); // callee clean
1683 /* function return */
1684 case BPF_JMP | BPF_EXIT:
		/* Optimization: when the last instruction is EXIT,
		 * simply fall through to the epilogue.
		 */
1688 if (i == ctx->prog->len - 1)
1690 jmp_offset = epilogue_offset(ctx);
1691 check_imm24(jmp_offset);
1692 emit(ARM_B(jmp_offset), ctx);
1695 pr_info_once("*** NOT YET: opcode %02x ***\n", code);
1698 pr_err_once("unknown opcode %02x\n", code);
1702 if (ctx->flags & FLAG_IMM_OVERFLOW)
1704 * this instruction generated an overflow when
1705 * trying to access the literal pool, so
1706 * delegate this filter to the kernel interpreter.
1712 static int build_body(struct jit_ctx *ctx)
1714 const struct bpf_prog *prog = ctx->prog;
1717 for (i = 0; i < prog->len; i++) {
1718 const struct bpf_insn *insn = &(prog->insnsi[i]);
1721 ret = build_insn(insn, ctx);
		/* A return value > 0 means a 64-bit immediate was loaded,
		 * which consumes two eBPF instructions.
		 */
1726 if (ctx->target == NULL)
1727 ctx->offsets[i] = ctx->idx;
1731 if (ctx->target == NULL)
1732 ctx->offsets[i] = ctx->idx;
		/* If unsuccessful, return with the error code */
1741 static int validate_code(struct jit_ctx *ctx)
1745 for (i = 0; i < ctx->idx; i++) {
1746 if (ctx->target[i] == __opcode_to_mem_arm(ARM_INST_UDF))
1753 void bpf_jit_compile(struct bpf_prog *prog)
1755 /* Nothing to do here. We support Internal BPF. */
1758 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
1760 struct bpf_prog *tmp, *orig_prog = prog;
1761 struct bpf_binary_header *header;
1762 bool tmp_blinded = false;
1764 unsigned int tmp_idx;
1765 unsigned int image_size;
	/* If BPF JIT was not enabled then we must fall back to
	 * the interpreter.
	 */
1771 if (!prog->jit_requested)
1774 /* If constant blinding was enabled and we failed during blinding
1775 * then we must fall back to the interpreter. Otherwise, we save
1776 * the new JITed code.
1778 tmp = bpf_jit_blind_constants(prog);
1787 memset(&ctx, 0, sizeof(ctx));
	/* If we are not able to allocate memory for offsets[], we
	 * must fall back to the interpreter.
	 */
1793 ctx.offsets = kcalloc(prog->len, sizeof(int), GFP_KERNEL);
1794 if (ctx.offsets == NULL) {
	/* 1) fake pass to find the length of the JITed code, to compute
	 *    ctx->offsets and the other context variables needed to compute
	 *    the final JITed code.
	 *    Also, calculate the random starting pointer/start of the JITed
	 *    code, which is prefixed by a random number of fault
	 *    instructions.
	 *
	 * If the first pass fails then there is no chance of it being
	 * successful in the second pass, so just fall back to the
	 * interpreter.
	 */
1809 if (build_body(&ctx)) {
1815 build_prologue(&ctx);
1816 ctx.prologue_bytes = (ctx.idx - tmp_idx) * 4;
1818 ctx.epilogue_offset = ctx.idx;
1820 #if __LINUX_ARM_ARCH__ < 7
1822 build_epilogue(&ctx);
1823 ctx.epilogue_bytes = (ctx.idx - tmp_idx) * 4;
1825 ctx.idx += ctx.imm_count;
1826 if (ctx.imm_count) {
1827 ctx.imms = kcalloc(ctx.imm_count, sizeof(u32), GFP_KERNEL);
1828 if (ctx.imms == NULL) {
	/* there's nothing after the epilogue on ARMv7 */
1835 build_epilogue(&ctx);
	/* Now we can get the actual image size of the JITed ARM code.
	 * Currently, we are not considering the THUMB-2 instructions
	 * for the JIT, although they could decrease the size of the image.
	 *
	 * As each ARM instruction is 32 bits long, we translate the number
	 * of JITed instructions into the size required to store them.
	 */
1845 image_size = sizeof(u32) * ctx.idx;
1847 /* Now we know the size of the structure to make */
1848 header = bpf_jit_binary_alloc(image_size, &image_ptr,
1849 sizeof(u32), jit_fill_hole);
	/* If we are not able to allocate memory for the image, we
	 * must fall back to the interpreter.
	 */
1853 if (header == NULL) {
1858 /* 2.) Actual pass to generate final JIT code */
1859 ctx.target = (u32 *) image_ptr;
1862 build_prologue(&ctx);
	/* If building the body of the JITed code fails somehow,
	 * we fall back to the interpreter.
	 */
1867 if (build_body(&ctx) < 0) {
1869 bpf_jit_binary_free(header);
1873 build_epilogue(&ctx);
1875 /* 3.) Extra pass to validate JITed Code */
1876 if (validate_code(&ctx)) {
1878 bpf_jit_binary_free(header);
1882 flush_icache_range((u32)header, (u32)(ctx.target + ctx.idx));
1884 if (bpf_jit_enable > 1)
1885 /* there are 2 passes here */
1886 bpf_jit_dump(prog->len, image_size, 2, ctx.target);
1888 bpf_jit_binary_lock_ro(header);
1889 prog->bpf_func = (void *)ctx.target;
1891 prog->jited_len = image_size;
1894 #if __LINUX_ARM_ARCH__ < 7
1902 bpf_jit_prog_release_other(prog, prog == orig_prog ?