Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
author    David S. Miller <davem@davemloft.net>  Wed, 22 Jul 2020 19:34:55 +0000 (12:34 -0700)
committer David S. Miller <davem@davemloft.net>  Wed, 22 Jul 2020 19:35:33 +0000 (12:35 -0700)
Alexei Starovoitov says:

====================
pull-request: bpf-next 2020-07-21

The following pull-request contains BPF updates for your *net-next* tree.

We've added 46 non-merge commits during the last 6 day(s) which contain
a total of 68 files changed, 4929 insertions(+), 526 deletions(-).

The main changes are:

1) Run a BPF program on socket lookup, from Jakub (see the sketch below).

2) Introduce attaching and running XDP programs on cpumap entries, from Lorenzo.

3) s390 JIT fixes, from Ilya.

4) Teach the riscv JIT to emit compressed (RVC) instructions, from Luke.

5) Use build-time computed BTF IDs in bpf iter, from Yonghong.
====================

The merge resolves purely independent, overlapping changes in both filter.h and xdp.h.

Signed-off-by: David S. Miller <davem@davemloft.net>
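As an illustration of (1), a minimal sk_lookup program might look like the
sketch below. The map, port number, and function name are made up for the
example, and the SEC() prefix follows the libbpf convention added by the
series; the BPF_PROG_TYPE_SK_LOOKUP program type, the struct bpf_sk_lookup
context, and the bpf_sk_assign() helper for this context are what the
series introduces:

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    /* One-slot sockmap holding the socket to steer new connections to
     * (map name and layout are illustrative). */
    struct {
            __uint(type, BPF_MAP_TYPE_SOCKMAP);
            __uint(max_entries, 1);
            __type(key, __u32);
            __type(value, __u64);
    } echo_sockmap SEC(".maps");

    SEC("sk_lookup/steer_echo")
    int steer_echo(struct bpf_sk_lookup *ctx)
    {
            const __u32 key = 0;
            struct bpf_sock *sk;
            long err;

            if (ctx->local_port != 7777)    /* illustrative port */
                    return SK_PASS;

            sk = bpf_map_lookup_elem(&echo_sockmap, &key);
            if (!sk)
                    return SK_DROP;

            err = bpf_sk_assign(ctx, sk, 0); /* select sk for this lookup */
            bpf_sk_release(sk);
            return err ? SK_DROP : SK_PASS;
    }

    char _license[] SEC("license") = "GPL";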
68 files changed:
arch/riscv/net/bpf_jit.h
arch/riscv/net/bpf_jit_comp32.c
arch/riscv/net/bpf_jit_comp64.c
arch/riscv/net/bpf_jit_core.c
arch/s390/net/bpf_jit_comp.c
include/linux/bpf-netns.h
include/linux/bpf.h
include/linux/bpf_types.h
include/linux/btf_ids.h
include/linux/filter.h
include/net/xdp.h
include/trace/events/xdp.h
include/uapi/linux/bpf.h
kernel/bpf/btf.c
kernel/bpf/core.c
kernel/bpf/cpumap.c
kernel/bpf/map_iter.c
kernel/bpf/net_namespace.c
kernel/bpf/syscall.c
kernel/bpf/task_iter.c
kernel/bpf/verifier.c
lib/test_bpf.c
net/core/dev.c
net/core/filter.c
net/ipv4/inet_hashtables.c
net/ipv4/tcp_ipv4.c
net/ipv4/udp.c
net/ipv6/inet6_hashtables.c
net/ipv6/route.c
net/ipv6/udp.c
net/netlink/af_netlink.c
samples/bpf/offwaketime_kern.c
samples/bpf/test_overhead_kprobe_kern.c
samples/bpf/tracex1_kern.c
samples/bpf/tracex5_kern.c
samples/bpf/xdp_redirect_cpu_kern.c
samples/bpf/xdp_redirect_cpu_user.c
scripts/bpf_helpers_doc.py
tools/bpf/bpftool/Documentation/bpftool-prog.rst
tools/bpf/bpftool/bash-completion/bpftool
tools/bpf/bpftool/common.c
tools/bpf/bpftool/gen.c
tools/bpf/bpftool/main.h
tools/bpf/bpftool/prog.c
tools/bpf/bpftool/skeleton/pid_iter.bpf.c
tools/include/linux/btf_ids.h
tools/include/uapi/linux/bpf.h
tools/lib/bpf/bpf_helpers.h
tools/lib/bpf/libbpf.c
tools/lib/bpf/libbpf.h
tools/lib/bpf/libbpf.map
tools/lib/bpf/libbpf_probes.c
tools/testing/selftests/bpf/network_helpers.c
tools/testing/selftests/bpf/network_helpers.h
tools/testing/selftests/bpf/prog_tests/resolve_btfids.c
tools/testing/selftests/bpf/prog_tests/sk_lookup.c [new file with mode: 0644]
tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c
tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/bpf_iter_netlink.c
tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c
tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c
tools/testing/selftests/bpf/progs/bpf_iter_udp4.c
tools/testing/selftests/bpf/progs/bpf_iter_udp6.c
tools/testing/selftests/bpf/progs/test_sk_lookup.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/test_xdp_with_cpumap_helpers.c [new file with mode: 0644]
tools/testing/selftests/bpf/test_kmod.sh
tools/testing/selftests/bpf/test_lwt_seg6local.sh
tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c [new file with mode: 0644]

diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h
index 20e235d..75c1e99 100644
 #include <linux/filter.h>
 #include <asm/cacheflush.h>
 
+static inline bool rvc_enabled(void)
+{
+       return IS_ENABLED(CONFIG_RISCV_ISA_C);
+}
+
 enum {
        RV_REG_ZERO =   0,      /* The constant value 0 */
        RV_REG_RA =     1,      /* Return address */
@@ -48,9 +53,21 @@ enum {
        RV_REG_T6 =     31,
 };
 
+static inline bool is_creg(u8 reg)
+{
+       return (1 << reg) & (BIT(RV_REG_FP) |
+                            BIT(RV_REG_S1) |
+                            BIT(RV_REG_A0) |
+                            BIT(RV_REG_A1) |
+                            BIT(RV_REG_A2) |
+                            BIT(RV_REG_A3) |
+                            BIT(RV_REG_A4) |
+                            BIT(RV_REG_A5));
+}
+
 struct rv_jit_context {
        struct bpf_prog *prog;
-       u32 *insns;             /* RV insns */
+       u16 *insns;             /* RV insns */
        int ninsns;
        int epilogue_offset;
        int *offset;            /* BPF to RV */
@@ -58,6 +75,12 @@ struct rv_jit_context {
        int stack_size;
 };
 
+/* Convert from ninsns to bytes. */
+static inline int ninsns_rvoff(int ninsns)
+{
+       return ninsns << 1;
+}
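+
+/* Illustration (not part of the patch): ninsns now counts 2-byte units,
+ * so emit() below advances it by 2 for a full 4-byte instruction and
+ * emitc() by 1 for a compressed one. Three compressed instructions
+ * followed by one full instruction thus give ninsns == 5, i.e.
+ * ninsns_rvoff(5) == 10 bytes of code. */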
+
 struct rv_jit_data {
        struct bpf_binary_header *header;
        u8 *image;
@@ -74,8 +97,22 @@ static inline void bpf_flush_icache(void *start, void *end)
        flush_icache_range((unsigned long)start, (unsigned long)end);
 }
 
+/* Emit a 4-byte RISC-V instruction. */
 static inline void emit(const u32 insn, struct rv_jit_context *ctx)
 {
+       if (ctx->insns) {
+               ctx->insns[ctx->ninsns] = insn;
+               ctx->insns[ctx->ninsns + 1] = (insn >> 16);
+       }
+
+       ctx->ninsns += 2;
+}
+
+/* Emit a 2-byte RISC-V compressed instruction. */
+static inline void emitc(const u16 insn, struct rv_jit_context *ctx)
+{
+       BUILD_BUG_ON(!rvc_enabled());
+
        if (ctx->insns)
                ctx->insns[ctx->ninsns] = insn;
 
@@ -86,7 +123,7 @@ static inline int epilogue_offset(struct rv_jit_context *ctx)
 {
        int to = ctx->epilogue_offset, from = ctx->ninsns;
 
-       return (to - from) << 2;
+       return ninsns_rvoff(to - from);
 }
 
 /* Return -1 or inverted cond. */
@@ -117,6 +154,36 @@ static inline int invert_bpf_cond(u8 cond)
        return -1;
 }
 
+static inline bool is_6b_int(long val)
+{
+       return -(1L << 5) <= val && val < (1L << 5);
+}
+
+static inline bool is_7b_uint(unsigned long val)
+{
+       return val < (1UL << 7);
+}
+
+static inline bool is_8b_uint(unsigned long val)
+{
+       return val < (1UL << 8);
+}
+
+static inline bool is_9b_uint(unsigned long val)
+{
+       return val < (1UL << 9);
+}
+
+static inline bool is_10b_int(long val)
+{
+       return -(1L << 9) <= val && val < (1L << 9);
+}
+
+static inline bool is_10b_uint(unsigned long val)
+{
+       return val < (1UL << 10);
+}
+
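+/* The checks below and above mirror RVC immediate widths (illustrative
+ * summary): is_6b_int() accepts [-32, 31], the signed 6-bit immediate
+ * of c.addi/c.li/c.andi; is_10b_int() matches the scaled c.addi16sp
+ * immediate; the unsigned variants cover zero-extended offsets such as
+ * c.lwsp (is_8b_uint) and c.addi4spn (is_10b_uint). */
+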
 static inline bool is_12b_int(long val)
 {
        return -(1L << 11) <= val && val < (1L << 11);
@@ -149,7 +216,7 @@ static inline int rv_offset(int insn, int off, struct rv_jit_context *ctx)
        off++; /* BPF branch is from PC+1, RV is from PC */
        from = (insn > 0) ? ctx->offset[insn - 1] : 0;
        to = (insn + off > 0) ? ctx->offset[insn + off - 1] : 0;
-       return (to - from) << 2;
+       return ninsns_rvoff(to - from);
 }
 
 /* Instruction formats. */
@@ -207,6 +274,59 @@ static inline u32 rv_amo_insn(u8 funct5, u8 aq, u8 rl, u8 rs2, u8 rs1,
        return rv_r_insn(funct7, rs2, rs1, funct3, rd, opcode);
 }
 
+/* RISC-V compressed instruction formats. */
+
+static inline u16 rv_cr_insn(u8 funct4, u8 rd, u8 rs2, u8 op)
+{
+       return (funct4 << 12) | (rd << 7) | (rs2 << 2) | op;
+}
+
+static inline u16 rv_ci_insn(u8 funct3, u32 imm6, u8 rd, u8 op)
+{
+       u32 imm;
+
+       imm = ((imm6 & 0x20) << 7) | ((imm6 & 0x1f) << 2);
+       return (funct3 << 13) | (rd << 7) | op | imm;
+}
+
+static inline u16 rv_css_insn(u8 funct3, u32 uimm, u8 rs2, u8 op)
+{
+       return (funct3 << 13) | (uimm << 7) | (rs2 << 2) | op;
+}
+
+static inline u16 rv_ciw_insn(u8 funct3, u32 uimm, u8 rd, u8 op)
+{
+       return (funct3 << 13) | (uimm << 5) | ((rd & 0x7) << 2) | op;
+}
+
+static inline u16 rv_cl_insn(u8 funct3, u32 imm_hi, u8 rs1, u32 imm_lo, u8 rd,
+                            u8 op)
+{
+       return (funct3 << 13) | (imm_hi << 10) | ((rs1 & 0x7) << 7) |
+               (imm_lo << 5) | ((rd & 0x7) << 2) | op;
+}
+
+static inline u16 rv_cs_insn(u8 funct3, u32 imm_hi, u8 rs1, u32 imm_lo, u8 rs2,
+                            u8 op)
+{
+       return (funct3 << 13) | (imm_hi << 10) | ((rs1 & 0x7) << 7) |
+               (imm_lo << 5) | ((rs2 & 0x7) << 2) | op;
+}
+
+static inline u16 rv_ca_insn(u8 funct6, u8 rd, u8 funct2, u8 rs2, u8 op)
+{
+       return (funct6 << 10) | ((rd & 0x7) << 7) | (funct2 << 5) |
+               ((rs2 & 0x7) << 2) | op;
+}
+
+static inline u16 rv_cb_insn(u8 funct3, u32 imm6, u8 funct2, u8 rd, u8 op)
+{
+       u32 imm;
+
+       imm = ((imm6 & 0x20) << 7) | ((imm6 & 0x1f) << 2);
+       return (funct3 << 13) | (funct2 << 10) | ((rd & 0x7) << 7) | op | imm;
+}
+
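+/* Concrete check of the CR format above (illustrative, not patch text):
+ * c.mv a0, a1 is rv_cr_insn(0x8, 10, 11, 0x2)
+ *   == (0x8 << 12) | (10 << 7) | (11 << 2) | 0x2 == 0x852e,
+ * the standard RVC encoding of "c.mv a0, a1". */
+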
 /* Instructions shared by both RV32 and RV64. */
 
 static inline u32 rv_addi(u8 rd, u8 rs1, u16 imm11_0)
@@ -414,6 +534,135 @@ static inline u32 rv_amoadd_w(u8 rd, u8 rs2, u8 rs1, u8 aq, u8 rl)
        return rv_amo_insn(0, aq, rl, rs2, rs1, 2, rd, 0x2f);
 }
 
+/* RVC instructions. */
+
+static inline u16 rvc_addi4spn(u8 rd, u32 imm10)
+{
+       u32 imm;
+
+       imm = ((imm10 & 0x30) << 2) | ((imm10 & 0x3c0) >> 4) |
+               ((imm10 & 0x4) >> 1) | ((imm10 & 0x8) >> 3);
+       return rv_ciw_insn(0x0, imm, rd, 0x0);
+}
+
+static inline u16 rvc_lw(u8 rd, u32 imm7, u8 rs1)
+{
+       u32 imm_hi, imm_lo;
+
+       imm_hi = (imm7 & 0x38) >> 3;
+       imm_lo = ((imm7 & 0x4) >> 1) | ((imm7 & 0x40) >> 6);
+       return rv_cl_insn(0x2, imm_hi, rs1, imm_lo, rd, 0x0);
+}
+
+static inline u16 rvc_sw(u8 rs1, u32 imm7, u8 rs2)
+{
+       u32 imm_hi, imm_lo;
+
+       imm_hi = (imm7 & 0x38) >> 3;
+       imm_lo = ((imm7 & 0x4) >> 1) | ((imm7 & 0x40) >> 6);
+       return rv_cs_insn(0x6, imm_hi, rs1, imm_lo, rs2, 0x0);
+}
+
+static inline u16 rvc_addi(u8 rd, u32 imm6)
+{
+       return rv_ci_insn(0, imm6, rd, 0x1);
+}
+
+static inline u16 rvc_li(u8 rd, u32 imm6)
+{
+       return rv_ci_insn(0x2, imm6, rd, 0x1);
+}
+
+static inline u16 rvc_addi16sp(u32 imm10)
+{
+       u32 imm;
+
+       imm = ((imm10 & 0x200) >> 4) | (imm10 & 0x10) | ((imm10 & 0x40) >> 3) |
+               ((imm10 & 0x180) >> 6) | ((imm10 & 0x20) >> 5);
+       return rv_ci_insn(0x3, imm, RV_REG_SP, 0x1);
+}
+
+static inline u16 rvc_lui(u8 rd, u32 imm6)
+{
+       return rv_ci_insn(0x3, imm6, rd, 0x1);
+}
+
+static inline u16 rvc_srli(u8 rd, u32 imm6)
+{
+       return rv_cb_insn(0x4, imm6, 0, rd, 0x1);
+}
+
+static inline u16 rvc_srai(u8 rd, u32 imm6)
+{
+       return rv_cb_insn(0x4, imm6, 0x1, rd, 0x1);
+}
+
+static inline u16 rvc_andi(u8 rd, u32 imm6)
+{
+       return rv_cb_insn(0x4, imm6, 0x2, rd, 0x1);
+}
+
+static inline u16 rvc_sub(u8 rd, u8 rs)
+{
+       return rv_ca_insn(0x23, rd, 0, rs, 0x1);
+}
+
+static inline u16 rvc_xor(u8 rd, u8 rs)
+{
+       return rv_ca_insn(0x23, rd, 0x1, rs, 0x1);
+}
+
+static inline u16 rvc_or(u8 rd, u8 rs)
+{
+       return rv_ca_insn(0x23, rd, 0x2, rs, 0x1);
+}
+
+static inline u16 rvc_and(u8 rd, u8 rs)
+{
+       return rv_ca_insn(0x23, rd, 0x3, rs, 0x1);
+}
+
+static inline u16 rvc_slli(u8 rd, u32 imm6)
+{
+       return rv_ci_insn(0, imm6, rd, 0x2);
+}
+
+static inline u16 rvc_lwsp(u8 rd, u32 imm8)
+{
+       u32 imm;
+
+       imm = ((imm8 & 0xc0) >> 6) | (imm8 & 0x3c);
+       return rv_ci_insn(0x2, imm, rd, 0x2);
+}
+
+static inline u16 rvc_jr(u8 rs1)
+{
+       return rv_cr_insn(0x8, rs1, RV_REG_ZERO, 0x2);
+}
+
+static inline u16 rvc_mv(u8 rd, u8 rs)
+{
+       return rv_cr_insn(0x8, rd, rs, 0x2);
+}
+
+static inline u16 rvc_jalr(u8 rs1)
+{
+       return rv_cr_insn(0x9, rs1, RV_REG_ZERO, 0x2);
+}
+
+static inline u16 rvc_add(u8 rd, u8 rs)
+{
+       return rv_cr_insn(0x9, rd, rs, 0x2);
+}
+
+static inline u16 rvc_swsp(u32 imm8, u8 rs2)
+{
+       u32 imm;
+
+       imm = (imm8 & 0x3c) | ((imm8 & 0xc0) >> 6);
+       return rv_css_insn(0x6, imm, rs2, 0x2);
+}
+
 /*
  * RV64-only instructions.
  *
@@ -503,6 +752,234 @@ static inline u32 rv_amoadd_d(u8 rd, u8 rs2, u8 rs1, u8 aq, u8 rl)
        return rv_amo_insn(0, aq, rl, rs2, rs1, 3, rd, 0x2f);
 }
 
+/* RV64-only RVC instructions. */
+
+static inline u16 rvc_ld(u8 rd, u32 imm8, u8 rs1)
+{
+       u32 imm_hi, imm_lo;
+
+       imm_hi = (imm8 & 0x38) >> 3;
+       imm_lo = (imm8 & 0xc0) >> 6;
+       return rv_cl_insn(0x3, imm_hi, rs1, imm_lo, rd, 0x0);
+}
+
+static inline u16 rvc_sd(u8 rs1, u32 imm8, u8 rs2)
+{
+       u32 imm_hi, imm_lo;
+
+       imm_hi = (imm8 & 0x38) >> 3;
+       imm_lo = (imm8 & 0xc0) >> 6;
+       return rv_cs_insn(0x7, imm_hi, rs1, imm_lo, rs2, 0x0);
+}
+
+static inline u16 rvc_subw(u8 rd, u8 rs)
+{
+       return rv_ca_insn(0x27, rd, 0, rs, 0x1);
+}
+
+static inline u16 rvc_addiw(u8 rd, u32 imm6)
+{
+       return rv_ci_insn(0x1, imm6, rd, 0x1);
+}
+
+static inline u16 rvc_ldsp(u8 rd, u32 imm9)
+{
+       u32 imm;
+
+       imm = ((imm9 & 0x1c0) >> 6) | (imm9 & 0x38);
+       return rv_ci_insn(0x3, imm, rd, 0x2);
+}
+
+static inline u16 rvc_sdsp(u32 imm9, u8 rs2)
+{
+       u32 imm;
+
+       imm = (imm9 & 0x38) | ((imm9 & 0x1c0) >> 6);
+       return rv_css_insn(0x7, imm, rs2, 0x2);
+}
+
+#endif /* __riscv_xlen == 64 */
+
+/* Helper functions that emit RVC instructions when possible. */
+
+static inline void emit_jalr(u8 rd, u8 rs, s32 imm, struct rv_jit_context *ctx)
+{
+       if (rvc_enabled() && rd == RV_REG_RA && rs && !imm)
+               emitc(rvc_jalr(rs), ctx);
+       else if (rvc_enabled() && !rd && rs && !imm)
+               emitc(rvc_jr(rs), ctx);
+       else
+               emit(rv_jalr(rd, rs, imm), ctx);
+}
+
+static inline void emit_mv(u8 rd, u8 rs, struct rv_jit_context *ctx)
+{
+       if (rvc_enabled() && rd && rs)
+               emitc(rvc_mv(rd, rs), ctx);
+       else
+               emit(rv_addi(rd, rs, 0), ctx);
+}
+
+static inline void emit_add(u8 rd, u8 rs1, u8 rs2, struct rv_jit_context *ctx)
+{
+       if (rvc_enabled() && rd && rd == rs1 && rs2)
+               emitc(rvc_add(rd, rs2), ctx);
+       else
+               emit(rv_add(rd, rs1, rs2), ctx);
+}
+
+static inline void emit_addi(u8 rd, u8 rs, s32 imm, struct rv_jit_context *ctx)
+{
+       if (rvc_enabled() && rd == RV_REG_SP && rd == rs && is_10b_int(imm) && imm && !(imm & 0xf))
+               emitc(rvc_addi16sp(imm), ctx);
+       else if (rvc_enabled() && is_creg(rd) && rs == RV_REG_SP && is_10b_uint(imm) &&
+                !(imm & 0x3) && imm)
+               emitc(rvc_addi4spn(rd, imm), ctx);
+       else if (rvc_enabled() && rd && rd == rs && imm && is_6b_int(imm))
+               emitc(rvc_addi(rd, imm), ctx);
+       else
+               emit(rv_addi(rd, rs, imm), ctx);
+}
+
+static inline void emit_li(u8 rd, s32 imm, struct rv_jit_context *ctx)
+{
+       if (rvc_enabled() && rd && is_6b_int(imm))
+               emitc(rvc_li(rd, imm), ctx);
+       else
+               emit(rv_addi(rd, RV_REG_ZERO, imm), ctx);
+}
+
+static inline void emit_lui(u8 rd, s32 imm, struct rv_jit_context *ctx)
+{
+       if (rvc_enabled() && rd && rd != RV_REG_SP && is_6b_int(imm) && imm)
+               emitc(rvc_lui(rd, imm), ctx);
+       else
+               emit(rv_lui(rd, imm), ctx);
+}
+
+static inline void emit_slli(u8 rd, u8 rs, s32 imm, struct rv_jit_context *ctx)
+{
+       if (rvc_enabled() && rd && rd == rs && imm && (u32)imm < __riscv_xlen)
+               emitc(rvc_slli(rd, imm), ctx);
+       else
+               emit(rv_slli(rd, rs, imm), ctx);
+}
+
+static inline void emit_andi(u8 rd, u8 rs, s32 imm, struct rv_jit_context *ctx)
+{
+       if (rvc_enabled() && is_creg(rd) && rd == rs && is_6b_int(imm))
+               emitc(rvc_andi(rd, imm), ctx);
+       else
+               emit(rv_andi(rd, rs, imm), ctx);
+}
+
+static inline void emit_srli(u8 rd, u8 rs, s32 imm, struct rv_jit_context *ctx)
+{
+       if (rvc_enabled() && is_creg(rd) && rd == rs && imm && (u32)imm < __riscv_xlen)
+               emitc(rvc_srli(rd, imm), ctx);
+       else
+               emit(rv_srli(rd, rs, imm), ctx);
+}
+
+static inline void emit_srai(u8 rd, u8 rs, s32 imm, struct rv_jit_context *ctx)
+{
+       if (rvc_enabled() && is_creg(rd) && rd == rs && imm && (u32)imm < __riscv_xlen)
+               emitc(rvc_srai(rd, imm), ctx);
+       else
+               emit(rv_srai(rd, rs, imm), ctx);
+}
+
+static inline void emit_sub(u8 rd, u8 rs1, u8 rs2, struct rv_jit_context *ctx)
+{
+       if (rvc_enabled() && is_creg(rd) && rd == rs1 && is_creg(rs2))
+               emitc(rvc_sub(rd, rs2), ctx);
+       else
+               emit(rv_sub(rd, rs1, rs2), ctx);
+}
+
+static inline void emit_or(u8 rd, u8 rs1, u8 rs2, struct rv_jit_context *ctx)
+{
+       if (rvc_enabled() && is_creg(rd) && rd == rs1 && is_creg(rs2))
+               emitc(rvc_or(rd, rs2), ctx);
+       else
+               emit(rv_or(rd, rs1, rs2), ctx);
+}
+
+static inline void emit_and(u8 rd, u8 rs1, u8 rs2, struct rv_jit_context *ctx)
+{
+       if (rvc_enabled() && is_creg(rd) && rd == rs1 && is_creg(rs2))
+               emitc(rvc_and(rd, rs2), ctx);
+       else
+               emit(rv_and(rd, rs1, rs2), ctx);
+}
+
+static inline void emit_xor(u8 rd, u8 rs1, u8 rs2, struct rv_jit_context *ctx)
+{
+       if (rvc_enabled() && is_creg(rd) && rd == rs1 && is_creg(rs2))
+               emitc(rvc_xor(rd, rs2), ctx);
+       else
+               emit(rv_xor(rd, rs1, rs2), ctx);
+}
+
+static inline void emit_lw(u8 rd, s32 off, u8 rs1, struct rv_jit_context *ctx)
+{
+       if (rvc_enabled() && rs1 == RV_REG_SP && rd && is_8b_uint(off) && !(off & 0x3))
+               emitc(rvc_lwsp(rd, off), ctx);
+       else if (rvc_enabled() && is_creg(rd) && is_creg(rs1) && is_7b_uint(off) && !(off & 0x3))
+               emitc(rvc_lw(rd, off, rs1), ctx);
+       else
+               emit(rv_lw(rd, off, rs1), ctx);
+}
+
+static inline void emit_sw(u8 rs1, s32 off, u8 rs2, struct rv_jit_context *ctx)
+{
+       if (rvc_enabled() && rs1 == RV_REG_SP && is_8b_uint(off) && !(off & 0x3))
+               emitc(rvc_swsp(off, rs2), ctx);
+       else if (rvc_enabled() && is_creg(rs1) && is_creg(rs2) && is_7b_uint(off) && !(off & 0x3))
+               emitc(rvc_sw(rs1, off, rs2), ctx);
+       else
+               emit(rv_sw(rs1, off, rs2), ctx);
+}
+
+/* RV64-only helper functions. */
+#if __riscv_xlen == 64
+
+static inline void emit_addiw(u8 rd, u8 rs, s32 imm, struct rv_jit_context *ctx)
+{
+       if (rvc_enabled() && rd && rd == rs && is_6b_int(imm))
+               emitc(rvc_addiw(rd, imm), ctx);
+       else
+               emit(rv_addiw(rd, rs, imm), ctx);
+}
+
+static inline void emit_ld(u8 rd, s32 off, u8 rs1, struct rv_jit_context *ctx)
+{
+       if (rvc_enabled() && rs1 == RV_REG_SP && rd && is_9b_uint(off) && !(off & 0x7))
+               emitc(rvc_ldsp(rd, off), ctx);
+       else if (rvc_enabled() && is_creg(rd) && is_creg(rs1) && is_8b_uint(off) && !(off & 0x7))
+               emitc(rvc_ld(rd, off, rs1), ctx);
+       else
+               emit(rv_ld(rd, off, rs1), ctx);
+}
+
+static inline void emit_sd(u8 rs1, s32 off, u8 rs2, struct rv_jit_context *ctx)
+{
+       if (rvc_enabled() && rs1 == RV_REG_SP && is_9b_uint(off) && !(off & 0x7))
+               emitc(rvc_sdsp(off, rs2), ctx);
+       else if (rvc_enabled() && is_creg(rs1) && is_creg(rs2) && is_8b_uint(off) && !(off & 0x7))
+               emitc(rvc_sd(rs1, off, rs2), ctx);
+       else
+               emit(rv_sd(rs1, off, rs2), ctx);
+}
+
+static inline void emit_subw(u8 rd, u8 rs1, u8 rs2, struct rv_jit_context *ctx)
+{
+       if (rvc_enabled() && is_creg(rd) && rd == rs1 && is_creg(rs2))
+               emitc(rvc_subw(rd, rs2), ctx);
+       else
+               emit(rv_subw(rd, rs1, rs2), ctx);
+}
+
 #endif /* __riscv_xlen == 64 */
 
 void bpf_jit_build_prologue(struct rv_jit_context *ctx);
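
/* Usage sketch for the emit_* helpers above (illustrative): during frame
 * setup, emit_addi(RV_REG_SP, RV_REG_SP, -16, ctx) matches the first
 * branch of emit_addi() (rd == rs == sp, non-zero 10-bit signed imm,
 * multiple of 16) and emits the 2-byte c.addi16sp, while
 * emit_addi(RV_REG_A0, RV_REG_SP, 16, ctx) matches the second branch and
 * emits c.addi4spn; anything with no RVC form falls back to the 4-byte
 * addi. */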
diff --git a/arch/riscv/net/bpf_jit_comp32.c b/arch/riscv/net/bpf_jit_comp32.c
index b198eaa..bc5f220 100644
@@ -644,7 +644,7 @@ static int emit_branch_r64(const s8 *src1, const s8 *src2, s32 rvoff,
 
        e = ctx->ninsns;
        /* Adjust for extra insns. */
-       rvoff -= (e - s) << 2;
+       rvoff -= ninsns_rvoff(e - s);
        emit_jump_and_link(RV_REG_ZERO, rvoff, true, ctx);
        return 0;
 }
@@ -713,7 +713,7 @@ static int emit_bcc(u8 op, u8 rd, u8 rs, int rvoff, struct rv_jit_context *ctx)
        if (far) {
                e = ctx->ninsns;
                /* Adjust for extra insns. */
-               rvoff -= (e - s) << 2;
+               rvoff -= ninsns_rvoff(e - s);
                emit_jump_and_link(RV_REG_ZERO, rvoff, true, ctx);
        }
        return 0;
@@ -731,7 +731,7 @@ static int emit_branch_r32(const s8 *src1, const s8 *src2, s32 rvoff,
 
        e = ctx->ninsns;
        /* Adjust for extra insns. */
-       rvoff -= (e - s) << 2;
+       rvoff -= ninsns_rvoff(e - s);
 
        if (emit_bcc(op, lo(rs1), lo(rs2), rvoff, ctx))
                return -1;
@@ -795,7 +795,7 @@ static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx)
         * if (index >= max_entries)
         *   goto out;
         */
-       off = (tc_ninsn - (ctx->ninsns - start_insn)) << 2;
+       off = ninsns_rvoff(tc_ninsn - (ctx->ninsns - start_insn));
        emit_bcc(BPF_JGE, lo(idx_reg), RV_REG_T1, off, ctx);
 
        /*
@@ -804,7 +804,7 @@ static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx)
         *   goto out;
         */
        emit(rv_addi(RV_REG_T1, RV_REG_TCC, -1), ctx);
-       off = (tc_ninsn - (ctx->ninsns - start_insn)) << 2;
+       off = ninsns_rvoff(tc_ninsn - (ctx->ninsns - start_insn));
        emit_bcc(BPF_JSLT, RV_REG_TCC, RV_REG_ZERO, off, ctx);
 
        /*
@@ -818,7 +818,7 @@ static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx)
        if (is_12b_check(off, insn))
                return -1;
        emit(rv_lw(RV_REG_T0, off, RV_REG_T0), ctx);
-       off = (tc_ninsn - (ctx->ninsns - start_insn)) << 2;
+       off = ninsns_rvoff(tc_ninsn - (ctx->ninsns - start_insn));
        emit_bcc(BPF_JEQ, RV_REG_T0, RV_REG_ZERO, off, ctx);
 
        /*
@@ -1214,7 +1214,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
                        emit_imm32(tmp2, imm, ctx);
                        src = tmp2;
                        e = ctx->ninsns;
-                       rvoff -= (e - s) << 2;
+                       rvoff -= ninsns_rvoff(e - s);
                }
 
                if (is64)
diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
index 6cfd164..8a56b52 100644
@@ -132,19 +132,23 @@ static void emit_imm(u8 rd, s64 val, struct rv_jit_context *ctx)
         *
         * This also means that we need to process LSB to MSB.
         */
-       s64 upper = (val + (1 << 11)) >> 12, lower = val & 0xfff;
+       s64 upper = (val + (1 << 11)) >> 12;
+       /* Sign-extend lower 12 bits to 64 bits since immediates for li, addiw,
+        * and addi are signed and RVC checks will perform signed comparisons.
+        */
+       s64 lower = ((val & 0xfff) << 52) >> 52;
        int shift;
 
        if (is_32b_int(val)) {
                if (upper)
-                       emit(rv_lui(rd, upper), ctx);
+                       emit_lui(rd, upper, ctx);
 
                if (!upper) {
-                       emit(rv_addi(rd, RV_REG_ZERO, lower), ctx);
+                       emit_li(rd, lower, ctx);
                        return;
                }
 
-               emit(rv_addiw(rd, rd, lower), ctx);
+               emit_addiw(rd, rd, lower, ctx);
                return;
        }
 
@@ -154,9 +158,9 @@ static void emit_imm(u8 rd, s64 val, struct rv_jit_context *ctx)
 
        emit_imm(rd, upper, ctx);
 
-       emit(rv_slli(rd, rd, shift), ctx);
+       emit_slli(rd, rd, shift, ctx);
        if (lower)
-               emit(rv_addi(rd, rd, lower), ctx);
+               emit_addi(rd, rd, lower, ctx);
 }
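
/* Worked example of the sign extension above (illustrative): for
 * val = 0xfff, upper = 1 and lower = -1 (previously 0xfff). The JIT
 * emits lui rd, 1 followed by addiw rd, rd, -1, i.e. 0x1000 - 1 = 0xfff,
 * and since -1 passes is_6b_int(), emit_addiw() can pick the 2-byte
 * c.addiw instead of the 4-byte addiw. */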
 
 static void __build_epilogue(bool is_tail_call, struct rv_jit_context *ctx)
@@ -164,43 +168,43 @@ static void __build_epilogue(bool is_tail_call, struct rv_jit_context *ctx)
        int stack_adjust = ctx->stack_size, store_offset = stack_adjust - 8;
 
        if (seen_reg(RV_REG_RA, ctx)) {
-               emit(rv_ld(RV_REG_RA, store_offset, RV_REG_SP), ctx);
+               emit_ld(RV_REG_RA, store_offset, RV_REG_SP, ctx);
                store_offset -= 8;
        }
-       emit(rv_ld(RV_REG_FP, store_offset, RV_REG_SP), ctx);
+       emit_ld(RV_REG_FP, store_offset, RV_REG_SP, ctx);
        store_offset -= 8;
        if (seen_reg(RV_REG_S1, ctx)) {
-               emit(rv_ld(RV_REG_S1, store_offset, RV_REG_SP), ctx);
+               emit_ld(RV_REG_S1, store_offset, RV_REG_SP, ctx);
                store_offset -= 8;
        }
        if (seen_reg(RV_REG_S2, ctx)) {
-               emit(rv_ld(RV_REG_S2, store_offset, RV_REG_SP), ctx);
+               emit_ld(RV_REG_S2, store_offset, RV_REG_SP, ctx);
                store_offset -= 8;
        }
        if (seen_reg(RV_REG_S3, ctx)) {
-               emit(rv_ld(RV_REG_S3, store_offset, RV_REG_SP), ctx);
+               emit_ld(RV_REG_S3, store_offset, RV_REG_SP, ctx);
                store_offset -= 8;
        }
        if (seen_reg(RV_REG_S4, ctx)) {
-               emit(rv_ld(RV_REG_S4, store_offset, RV_REG_SP), ctx);
+               emit_ld(RV_REG_S4, store_offset, RV_REG_SP, ctx);
                store_offset -= 8;
        }
        if (seen_reg(RV_REG_S5, ctx)) {
-               emit(rv_ld(RV_REG_S5, store_offset, RV_REG_SP), ctx);
+               emit_ld(RV_REG_S5, store_offset, RV_REG_SP, ctx);
                store_offset -= 8;
        }
        if (seen_reg(RV_REG_S6, ctx)) {
-               emit(rv_ld(RV_REG_S6, store_offset, RV_REG_SP), ctx);
+               emit_ld(RV_REG_S6, store_offset, RV_REG_SP, ctx);
                store_offset -= 8;
        }
 
-       emit(rv_addi(RV_REG_SP, RV_REG_SP, stack_adjust), ctx);
+       emit_addi(RV_REG_SP, RV_REG_SP, stack_adjust, ctx);
        /* Set return value. */
        if (!is_tail_call)
-               emit(rv_addi(RV_REG_A0, RV_REG_A5, 0), ctx);
-       emit(rv_jalr(RV_REG_ZERO, is_tail_call ? RV_REG_T3 : RV_REG_RA,
-                    is_tail_call ? 4 : 0), /* skip TCC init */
-            ctx);
+               emit_mv(RV_REG_A0, RV_REG_A5, ctx);
+       emit_jalr(RV_REG_ZERO, is_tail_call ? RV_REG_T3 : RV_REG_RA,
+                 is_tail_call ? 4 : 0, /* skip TCC init */
+                 ctx);
 }
 
 static void emit_bcc(u8 cond, u8 rd, u8 rs, int rvoff,
@@ -280,8 +284,8 @@ static void emit_branch(u8 cond, u8 rd, u8 rs, int rvoff,
 
 static void emit_zext_32(u8 reg, struct rv_jit_context *ctx)
 {
-       emit(rv_slli(reg, reg, 32), ctx);
-       emit(rv_srli(reg, reg, 32), ctx);
+       emit_slli(reg, reg, 32, ctx);
+       emit_srli(reg, reg, 32, ctx);
 }
 
 static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx)
@@ -304,35 +308,35 @@ static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx)
        if (is_12b_check(off, insn))
                return -1;
        emit(rv_lwu(RV_REG_T1, off, RV_REG_A1), ctx);
-       off = (tc_ninsn - (ctx->ninsns - start_insn)) << 2;
+       off = ninsns_rvoff(tc_ninsn - (ctx->ninsns - start_insn));
        emit_branch(BPF_JGE, RV_REG_A2, RV_REG_T1, off, ctx);
 
        /* if (TCC-- < 0)
         *     goto out;
         */
-       emit(rv_addi(RV_REG_T1, tcc, -1), ctx);
-       off = (tc_ninsn - (ctx->ninsns - start_insn)) << 2;
+       emit_addi(RV_REG_T1, tcc, -1, ctx);
+       off = ninsns_rvoff(tc_ninsn - (ctx->ninsns - start_insn));
        emit_branch(BPF_JSLT, tcc, RV_REG_ZERO, off, ctx);
 
        /* prog = array->ptrs[index];
         * if (!prog)
         *     goto out;
         */
-       emit(rv_slli(RV_REG_T2, RV_REG_A2, 3), ctx);
-       emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_A1), ctx);
+       emit_slli(RV_REG_T2, RV_REG_A2, 3, ctx);
+       emit_add(RV_REG_T2, RV_REG_T2, RV_REG_A1, ctx);
        off = offsetof(struct bpf_array, ptrs);
        if (is_12b_check(off, insn))
                return -1;
-       emit(rv_ld(RV_REG_T2, off, RV_REG_T2), ctx);
-       off = (tc_ninsn - (ctx->ninsns - start_insn)) << 2;
+       emit_ld(RV_REG_T2, off, RV_REG_T2, ctx);
+       off = ninsns_rvoff(tc_ninsn - (ctx->ninsns - start_insn));
        emit_branch(BPF_JEQ, RV_REG_T2, RV_REG_ZERO, off, ctx);
 
        /* goto *(prog->bpf_func + 4); */
        off = offsetof(struct bpf_prog, bpf_func);
        if (is_12b_check(off, insn))
                return -1;
-       emit(rv_ld(RV_REG_T3, off, RV_REG_T2), ctx);
-       emit(rv_addi(RV_REG_TCC, RV_REG_T1, 0), ctx);
+       emit_ld(RV_REG_T3, off, RV_REG_T2, ctx);
+       emit_mv(RV_REG_TCC, RV_REG_T1, ctx);
        __build_epilogue(true, ctx);
        return 0;
 }
@@ -360,9 +364,9 @@ static void init_regs(u8 *rd, u8 *rs, const struct bpf_insn *insn,
 
 static void emit_zext_32_rd_rs(u8 *rd, u8 *rs, struct rv_jit_context *ctx)
 {
-       emit(rv_addi(RV_REG_T2, *rd, 0), ctx);
+       emit_mv(RV_REG_T2, *rd, ctx);
        emit_zext_32(RV_REG_T2, ctx);
-       emit(rv_addi(RV_REG_T1, *rs, 0), ctx);
+       emit_mv(RV_REG_T1, *rs, ctx);
        emit_zext_32(RV_REG_T1, ctx);
        *rd = RV_REG_T2;
        *rs = RV_REG_T1;
@@ -370,15 +374,15 @@ static void emit_zext_32_rd_rs(u8 *rd, u8 *rs, struct rv_jit_context *ctx)
 
 static void emit_sext_32_rd_rs(u8 *rd, u8 *rs, struct rv_jit_context *ctx)
 {
-       emit(rv_addiw(RV_REG_T2, *rd, 0), ctx);
-       emit(rv_addiw(RV_REG_T1, *rs, 0), ctx);
+       emit_addiw(RV_REG_T2, *rd, 0, ctx);
+       emit_addiw(RV_REG_T1, *rs, 0, ctx);
        *rd = RV_REG_T2;
        *rs = RV_REG_T1;
 }
 
 static void emit_zext_32_rd_t1(u8 *rd, struct rv_jit_context *ctx)
 {
-       emit(rv_addi(RV_REG_T2, *rd, 0), ctx);
+       emit_mv(RV_REG_T2, *rd, ctx);
        emit_zext_32(RV_REG_T2, ctx);
        emit_zext_32(RV_REG_T1, ctx);
        *rd = RV_REG_T2;
@@ -386,7 +390,7 @@ static void emit_zext_32_rd_t1(u8 *rd, struct rv_jit_context *ctx)
 
 static void emit_sext_32_rd(u8 *rd, struct rv_jit_context *ctx)
 {
-       emit(rv_addiw(RV_REG_T2, *rd, 0), ctx);
+       emit_addiw(RV_REG_T2, *rd, 0, ctx);
        *rd = RV_REG_T2;
 }
 
@@ -432,7 +436,7 @@ static int emit_call(bool fixed, u64 addr, struct rv_jit_context *ctx)
        if (ret)
                return ret;
        rd = bpf_to_rv_reg(BPF_REG_0, ctx);
-       emit(rv_addi(rd, RV_REG_A0, 0), ctx);
+       emit_mv(rd, RV_REG_A0, ctx);
        return 0;
 }
 
@@ -458,7 +462,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
                        emit_zext_32(rd, ctx);
                        break;
                }
-               emit(is64 ? rv_addi(rd, rs, 0) : rv_addiw(rd, rs, 0), ctx);
+               emit_mv(rd, rs, ctx);
                if (!is64 && !aux->verifier_zext)
                        emit_zext_32(rd, ctx);
                break;
@@ -466,31 +470,35 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
        /* dst = dst OP src */
        case BPF_ALU | BPF_ADD | BPF_X:
        case BPF_ALU64 | BPF_ADD | BPF_X:
-               emit(is64 ? rv_add(rd, rd, rs) : rv_addw(rd, rd, rs), ctx);
+               emit_add(rd, rd, rs, ctx);
                if (!is64 && !aux->verifier_zext)
                        emit_zext_32(rd, ctx);
                break;
        case BPF_ALU | BPF_SUB | BPF_X:
        case BPF_ALU64 | BPF_SUB | BPF_X:
-               emit(is64 ? rv_sub(rd, rd, rs) : rv_subw(rd, rd, rs), ctx);
+               if (is64)
+                       emit_sub(rd, rd, rs, ctx);
+               else
+                       emit_subw(rd, rd, rs, ctx);
+
                if (!is64 && !aux->verifier_zext)
                        emit_zext_32(rd, ctx);
                break;
        case BPF_ALU | BPF_AND | BPF_X:
        case BPF_ALU64 | BPF_AND | BPF_X:
-               emit(rv_and(rd, rd, rs), ctx);
+               emit_and(rd, rd, rs, ctx);
                if (!is64 && !aux->verifier_zext)
                        emit_zext_32(rd, ctx);
                break;
        case BPF_ALU | BPF_OR | BPF_X:
        case BPF_ALU64 | BPF_OR | BPF_X:
-               emit(rv_or(rd, rd, rs), ctx);
+               emit_or(rd, rd, rs, ctx);
                if (!is64 && !aux->verifier_zext)
                        emit_zext_32(rd, ctx);
                break;
        case BPF_ALU | BPF_XOR | BPF_X:
        case BPF_ALU64 | BPF_XOR | BPF_X:
-               emit(rv_xor(rd, rd, rs), ctx);
+               emit_xor(rd, rd, rs, ctx);
                if (!is64 && !aux->verifier_zext)
                        emit_zext_32(rd, ctx);
                break;
@@ -534,8 +542,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
        /* dst = -dst */
        case BPF_ALU | BPF_NEG:
        case BPF_ALU64 | BPF_NEG:
-               emit(is64 ? rv_sub(rd, RV_REG_ZERO, rd) :
-                    rv_subw(rd, RV_REG_ZERO, rd), ctx);
+               emit_sub(rd, RV_REG_ZERO, rd, ctx);
                if (!is64 && !aux->verifier_zext)
                        emit_zext_32(rd, ctx);
                break;
@@ -544,8 +551,8 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
        case BPF_ALU | BPF_END | BPF_FROM_LE:
                switch (imm) {
                case 16:
-                       emit(rv_slli(rd, rd, 48), ctx);
-                       emit(rv_srli(rd, rd, 48), ctx);
+                       emit_slli(rd, rd, 48, ctx);
+                       emit_srli(rd, rd, 48, ctx);
                        break;
                case 32:
                        if (!aux->verifier_zext)
@@ -558,51 +565,51 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
                break;
 
        case BPF_ALU | BPF_END | BPF_FROM_BE:
-               emit(rv_addi(RV_REG_T2, RV_REG_ZERO, 0), ctx);
+               emit_li(RV_REG_T2, 0, ctx);
 
-               emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
-               emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
-               emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
-               emit(rv_srli(rd, rd, 8), ctx);
+               emit_andi(RV_REG_T1, rd, 0xff, ctx);
+               emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
+               emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
+               emit_srli(rd, rd, 8, ctx);
                if (imm == 16)
                        goto out_be;
 
-               emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
-               emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
-               emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
-               emit(rv_srli(rd, rd, 8), ctx);
+               emit_andi(RV_REG_T1, rd, 0xff, ctx);
+               emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
+               emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
+               emit_srli(rd, rd, 8, ctx);
 
-               emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
-               emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
-               emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
-               emit(rv_srli(rd, rd, 8), ctx);
+               emit_andi(RV_REG_T1, rd, 0xff, ctx);
+               emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
+               emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
+               emit_srli(rd, rd, 8, ctx);
                if (imm == 32)
                        goto out_be;
 
-               emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
-               emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
-               emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
-               emit(rv_srli(rd, rd, 8), ctx);
-
-               emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
-               emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
-               emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
-               emit(rv_srli(rd, rd, 8), ctx);
-
-               emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
-               emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
-               emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
-               emit(rv_srli(rd, rd, 8), ctx);
-
-               emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
-               emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
-               emit(rv_slli(RV_REG_T2, RV_REG_T2, 8), ctx);
-               emit(rv_srli(rd, rd, 8), ctx);
+               emit_andi(RV_REG_T1, rd, 0xff, ctx);
+               emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
+               emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
+               emit_srli(rd, rd, 8, ctx);
+
+               emit_andi(RV_REG_T1, rd, 0xff, ctx);
+               emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
+               emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
+               emit_srli(rd, rd, 8, ctx);
+
+               emit_andi(RV_REG_T1, rd, 0xff, ctx);
+               emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
+               emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
+               emit_srli(rd, rd, 8, ctx);
+
+               emit_andi(RV_REG_T1, rd, 0xff, ctx);
+               emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
+               emit_slli(RV_REG_T2, RV_REG_T2, 8, ctx);
+               emit_srli(rd, rd, 8, ctx);
 out_be:
-               emit(rv_andi(RV_REG_T1, rd, 0xff), ctx);
-               emit(rv_add(RV_REG_T2, RV_REG_T2, RV_REG_T1), ctx);
+               emit_andi(RV_REG_T1, rd, 0xff, ctx);
+               emit_add(RV_REG_T2, RV_REG_T2, RV_REG_T1, ctx);
 
-               emit(rv_addi(rd, RV_REG_T2, 0), ctx);
+               emit_mv(rd, RV_REG_T2, ctx);
                break;
 
        /* dst = imm */
@@ -617,12 +624,10 @@ out_be:
        case BPF_ALU | BPF_ADD | BPF_K:
        case BPF_ALU64 | BPF_ADD | BPF_K:
                if (is_12b_int(imm)) {
-                       emit(is64 ? rv_addi(rd, rd, imm) :
-                            rv_addiw(rd, rd, imm), ctx);
+                       emit_addi(rd, rd, imm, ctx);
                } else {
                        emit_imm(RV_REG_T1, imm, ctx);
-                       emit(is64 ? rv_add(rd, rd, RV_REG_T1) :
-                            rv_addw(rd, rd, RV_REG_T1), ctx);
+                       emit_add(rd, rd, RV_REG_T1, ctx);
                }
                if (!is64 && !aux->verifier_zext)
                        emit_zext_32(rd, ctx);
@@ -630,12 +635,10 @@ out_be:
        case BPF_ALU | BPF_SUB | BPF_K:
        case BPF_ALU64 | BPF_SUB | BPF_K:
                if (is_12b_int(-imm)) {
-                       emit(is64 ? rv_addi(rd, rd, -imm) :
-                            rv_addiw(rd, rd, -imm), ctx);
+                       emit_addi(rd, rd, -imm, ctx);
                } else {
                        emit_imm(RV_REG_T1, imm, ctx);
-                       emit(is64 ? rv_sub(rd, rd, RV_REG_T1) :
-                            rv_subw(rd, rd, RV_REG_T1), ctx);
+                       emit_sub(rd, rd, RV_REG_T1, ctx);
                }
                if (!is64 && !aux->verifier_zext)
                        emit_zext_32(rd, ctx);
@@ -643,10 +646,10 @@ out_be:
        case BPF_ALU | BPF_AND | BPF_K:
        case BPF_ALU64 | BPF_AND | BPF_K:
                if (is_12b_int(imm)) {
-                       emit(rv_andi(rd, rd, imm), ctx);
+                       emit_andi(rd, rd, imm, ctx);
                } else {
                        emit_imm(RV_REG_T1, imm, ctx);
-                       emit(rv_and(rd, rd, RV_REG_T1), ctx);
+                       emit_and(rd, rd, RV_REG_T1, ctx);
                }
                if (!is64 && !aux->verifier_zext)
                        emit_zext_32(rd, ctx);
@@ -657,7 +660,7 @@ out_be:
                        emit(rv_ori(rd, rd, imm), ctx);
                } else {
                        emit_imm(RV_REG_T1, imm, ctx);
-                       emit(rv_or(rd, rd, RV_REG_T1), ctx);
+                       emit_or(rd, rd, RV_REG_T1, ctx);
                }
                if (!is64 && !aux->verifier_zext)
                        emit_zext_32(rd, ctx);
@@ -668,7 +671,7 @@ out_be:
                        emit(rv_xori(rd, rd, imm), ctx);
                } else {
                        emit_imm(RV_REG_T1, imm, ctx);
-                       emit(rv_xor(rd, rd, RV_REG_T1), ctx);
+                       emit_xor(rd, rd, RV_REG_T1, ctx);
                }
                if (!is64 && !aux->verifier_zext)
                        emit_zext_32(rd, ctx);
@@ -699,19 +702,28 @@ out_be:
                break;
        case BPF_ALU | BPF_LSH | BPF_K:
        case BPF_ALU64 | BPF_LSH | BPF_K:
-               emit(is64 ? rv_slli(rd, rd, imm) : rv_slliw(rd, rd, imm), ctx);
+               emit_slli(rd, rd, imm, ctx);
+
                if (!is64 && !aux->verifier_zext)
                        emit_zext_32(rd, ctx);
                break;
        case BPF_ALU | BPF_RSH | BPF_K:
        case BPF_ALU64 | BPF_RSH | BPF_K:
-               emit(is64 ? rv_srli(rd, rd, imm) : rv_srliw(rd, rd, imm), ctx);
+               if (is64)
+                       emit_srli(rd, rd, imm, ctx);
+               else
+                       emit(rv_srliw(rd, rd, imm), ctx);
+
                if (!is64 && !aux->verifier_zext)
                        emit_zext_32(rd, ctx);
                break;
        case BPF_ALU | BPF_ARSH | BPF_K:
        case BPF_ALU64 | BPF_ARSH | BPF_K:
-               emit(is64 ? rv_srai(rd, rd, imm) : rv_sraiw(rd, rd, imm), ctx);
+               if (is64)
+                       emit_srai(rd, rd, imm, ctx);
+               else
+                       emit(rv_sraiw(rd, rd, imm), ctx);
+
                if (!is64 && !aux->verifier_zext)
                        emit_zext_32(rd, ctx);
                break;
@@ -757,13 +769,13 @@ out_be:
                        e = ctx->ninsns;
 
                        /* Adjust for extra insns */
-                       rvoff -= (e - s) << 2;
+                       rvoff -= ninsns_rvoff(e - s);
                }
 
                if (BPF_OP(code) == BPF_JSET) {
                        /* Adjust for and */
                        rvoff -= 4;
-                       emit(rv_and(RV_REG_T1, rd, rs), ctx);
+                       emit_and(RV_REG_T1, rd, rs, ctx);
                        emit_branch(BPF_JNE, RV_REG_T1, RV_REG_ZERO, rvoff,
                                    ctx);
                } else {
@@ -810,7 +822,7 @@ out_be:
                e = ctx->ninsns;
 
                /* Adjust for extra insns */
-               rvoff -= (e - s) << 2;
+               rvoff -= ninsns_rvoff(e - s);
                emit_branch(BPF_OP(code), rd, rs, rvoff, ctx);
                break;
 
@@ -819,19 +831,19 @@ out_be:
                rvoff = rv_offset(i, off, ctx);
                s = ctx->ninsns;
                if (is_12b_int(imm)) {
-                       emit(rv_andi(RV_REG_T1, rd, imm), ctx);
+                       emit_andi(RV_REG_T1, rd, imm, ctx);
                } else {
                        emit_imm(RV_REG_T1, imm, ctx);
-                       emit(rv_and(RV_REG_T1, rd, RV_REG_T1), ctx);
+                       emit_and(RV_REG_T1, rd, RV_REG_T1, ctx);
                }
                /* For jset32, we should clear the upper 32 bits of t1, but
                 * sign-extension is sufficient here and saves one instruction,
                 * as t1 is used only in comparison against zero.
                 */
                if (!is64 && imm < 0)
-                       emit(rv_addiw(RV_REG_T1, RV_REG_T1, 0), ctx);
+                       emit_addiw(RV_REG_T1, RV_REG_T1, 0, ctx);
                e = ctx->ninsns;
-               rvoff -= (e - s) << 2;
+               rvoff -= ninsns_rvoff(e - s);
                emit_branch(BPF_JNE, RV_REG_T1, RV_REG_ZERO, rvoff, ctx);
                break;
 
@@ -887,7 +899,7 @@ out_be:
                }
 
                emit_imm(RV_REG_T1, off, ctx);
-               emit(rv_add(RV_REG_T1, RV_REG_T1, rs), ctx);
+               emit_add(RV_REG_T1, RV_REG_T1, rs, ctx);
                emit(rv_lbu(rd, 0, RV_REG_T1), ctx);
                if (insn_is_zext(&insn[1]))
                        return 1;
@@ -899,7 +911,7 @@ out_be:
                }
 
                emit_imm(RV_REG_T1, off, ctx);
-               emit(rv_add(RV_REG_T1, RV_REG_T1, rs), ctx);
+               emit_add(RV_REG_T1, RV_REG_T1, rs, ctx);
                emit(rv_lhu(rd, 0, RV_REG_T1), ctx);
                if (insn_is_zext(&insn[1]))
                        return 1;
@@ -911,20 +923,20 @@ out_be:
                }
 
                emit_imm(RV_REG_T1, off, ctx);
-               emit(rv_add(RV_REG_T1, RV_REG_T1, rs), ctx);
+               emit_add(RV_REG_T1, RV_REG_T1, rs, ctx);
                emit(rv_lwu(rd, 0, RV_REG_T1), ctx);
                if (insn_is_zext(&insn[1]))
                        return 1;
                break;
        case BPF_LDX | BPF_MEM | BPF_DW:
                if (is_12b_int(off)) {
-                       emit(rv_ld(rd, off, rs), ctx);
+                       emit_ld(rd, off, rs, ctx);
                        break;
                }
 
                emit_imm(RV_REG_T1, off, ctx);
-               emit(rv_add(RV_REG_T1, RV_REG_T1, rs), ctx);
-               emit(rv_ld(rd, 0, RV_REG_T1), ctx);
+               emit_add(RV_REG_T1, RV_REG_T1, rs, ctx);
+               emit_ld(rd, 0, RV_REG_T1, ctx);
                break;
 
        /* ST: *(size *)(dst + off) = imm */
@@ -936,7 +948,7 @@ out_be:
                }
 
                emit_imm(RV_REG_T2, off, ctx);
-               emit(rv_add(RV_REG_T2, RV_REG_T2, rd), ctx);
+               emit_add(RV_REG_T2, RV_REG_T2, rd, ctx);
                emit(rv_sb(RV_REG_T2, 0, RV_REG_T1), ctx);
                break;
 
@@ -948,30 +960,30 @@ out_be:
                }
 
                emit_imm(RV_REG_T2, off, ctx);
-               emit(rv_add(RV_REG_T2, RV_REG_T2, rd), ctx);
+               emit_add(RV_REG_T2, RV_REG_T2, rd, ctx);
                emit(rv_sh(RV_REG_T2, 0, RV_REG_T1), ctx);
                break;
        case BPF_ST | BPF_MEM | BPF_W:
                emit_imm(RV_REG_T1, imm, ctx);
                if (is_12b_int(off)) {
-                       emit(rv_sw(rd, off, RV_REG_T1), ctx);
+                       emit_sw(rd, off, RV_REG_T1, ctx);
                        break;
                }
 
                emit_imm(RV_REG_T2, off, ctx);
-               emit(rv_add(RV_REG_T2, RV_REG_T2, rd), ctx);
-               emit(rv_sw(RV_REG_T2, 0, RV_REG_T1), ctx);
+               emit_add(RV_REG_T2, RV_REG_T2, rd, ctx);
+               emit_sw(RV_REG_T2, 0, RV_REG_T1, ctx);
                break;
        case BPF_ST | BPF_MEM | BPF_DW:
                emit_imm(RV_REG_T1, imm, ctx);
                if (is_12b_int(off)) {
-                       emit(rv_sd(rd, off, RV_REG_T1), ctx);
+                       emit_sd(rd, off, RV_REG_T1, ctx);
                        break;
                }
 
                emit_imm(RV_REG_T2, off, ctx);
-               emit(rv_add(RV_REG_T2, RV_REG_T2, rd), ctx);
-               emit(rv_sd(RV_REG_T2, 0, RV_REG_T1), ctx);
+               emit_add(RV_REG_T2, RV_REG_T2, rd, ctx);
+               emit_sd(RV_REG_T2, 0, RV_REG_T1, ctx);
                break;
 
        /* STX: *(size *)(dst + off) = src */
@@ -982,7 +994,7 @@ out_be:
                }
 
                emit_imm(RV_REG_T1, off, ctx);
-               emit(rv_add(RV_REG_T1, RV_REG_T1, rd), ctx);
+               emit_add(RV_REG_T1, RV_REG_T1, rd, ctx);
                emit(rv_sb(RV_REG_T1, 0, rs), ctx);
                break;
        case BPF_STX | BPF_MEM | BPF_H:
@@ -992,28 +1004,28 @@ out_be:
                }
 
                emit_imm(RV_REG_T1, off, ctx);
-               emit(rv_add(RV_REG_T1, RV_REG_T1, rd), ctx);
+               emit_add(RV_REG_T1, RV_REG_T1, rd, ctx);
                emit(rv_sh(RV_REG_T1, 0, rs), ctx);
                break;
        case BPF_STX | BPF_MEM | BPF_W:
                if (is_12b_int(off)) {
-                       emit(rv_sw(rd, off, rs), ctx);
+                       emit_sw(rd, off, rs, ctx);
                        break;
                }
 
                emit_imm(RV_REG_T1, off, ctx);
-               emit(rv_add(RV_REG_T1, RV_REG_T1, rd), ctx);
-               emit(rv_sw(RV_REG_T1, 0, rs), ctx);
+               emit_add(RV_REG_T1, RV_REG_T1, rd, ctx);
+               emit_sw(RV_REG_T1, 0, rs, ctx);
                break;
        case BPF_STX | BPF_MEM | BPF_DW:
                if (is_12b_int(off)) {
-                       emit(rv_sd(rd, off, rs), ctx);
+                       emit_sd(rd, off, rs, ctx);
                        break;
                }
 
                emit_imm(RV_REG_T1, off, ctx);
-               emit(rv_add(RV_REG_T1, RV_REG_T1, rd), ctx);
-               emit(rv_sd(RV_REG_T1, 0, rs), ctx);
+               emit_add(RV_REG_T1, RV_REG_T1, rd, ctx);
+               emit_sd(RV_REG_T1, 0, rs, ctx);
                break;
        /* STX XADD: lock *(u32 *)(dst + off) += src */
        case BPF_STX | BPF_XADD | BPF_W:
@@ -1021,10 +1033,10 @@ out_be:
        case BPF_STX | BPF_XADD | BPF_DW:
                if (off) {
                        if (is_12b_int(off)) {
-                               emit(rv_addi(RV_REG_T1, rd, off), ctx);
+                               emit_addi(RV_REG_T1, rd, off, ctx);
                        } else {
                                emit_imm(RV_REG_T1, off, ctx);
-                               emit(rv_add(RV_REG_T1, RV_REG_T1, rd), ctx);
+                               emit_add(RV_REG_T1, RV_REG_T1, rd, ctx);
                        }
 
                        rd = RV_REG_T1;
@@ -1073,52 +1085,53 @@ void bpf_jit_build_prologue(struct rv_jit_context *ctx)
 
        /* First instruction is always setting the tail-call-counter
         * (TCC) register. This instruction is skipped for tail calls.
+        * Force using a 4-byte (non-compressed) instruction.
         */
        emit(rv_addi(RV_REG_TCC, RV_REG_ZERO, MAX_TAIL_CALL_CNT), ctx);
 
-       emit(rv_addi(RV_REG_SP, RV_REG_SP, -stack_adjust), ctx);
+       emit_addi(RV_REG_SP, RV_REG_SP, -stack_adjust, ctx);
 
        if (seen_reg(RV_REG_RA, ctx)) {
-               emit(rv_sd(RV_REG_SP, store_offset, RV_REG_RA), ctx);
+               emit_sd(RV_REG_SP, store_offset, RV_REG_RA, ctx);
                store_offset -= 8;
        }
-       emit(rv_sd(RV_REG_SP, store_offset, RV_REG_FP), ctx);
+       emit_sd(RV_REG_SP, store_offset, RV_REG_FP, ctx);
        store_offset -= 8;
        if (seen_reg(RV_REG_S1, ctx)) {
-               emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S1), ctx);
+               emit_sd(RV_REG_SP, store_offset, RV_REG_S1, ctx);
                store_offset -= 8;
        }
        if (seen_reg(RV_REG_S2, ctx)) {
-               emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S2), ctx);
+               emit_sd(RV_REG_SP, store_offset, RV_REG_S2, ctx);
                store_offset -= 8;
        }
        if (seen_reg(RV_REG_S3, ctx)) {
-               emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S3), ctx);
+               emit_sd(RV_REG_SP, store_offset, RV_REG_S3, ctx);
                store_offset -= 8;
        }
        if (seen_reg(RV_REG_S4, ctx)) {
-               emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S4), ctx);
+               emit_sd(RV_REG_SP, store_offset, RV_REG_S4, ctx);
                store_offset -= 8;
        }
        if (seen_reg(RV_REG_S5, ctx)) {
-               emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S5), ctx);
+               emit_sd(RV_REG_SP, store_offset, RV_REG_S5, ctx);
                store_offset -= 8;
        }
        if (seen_reg(RV_REG_S6, ctx)) {
-               emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S6), ctx);
+               emit_sd(RV_REG_SP, store_offset, RV_REG_S6, ctx);
                store_offset -= 8;
        }
 
-       emit(rv_addi(RV_REG_FP, RV_REG_SP, stack_adjust), ctx);
+       emit_addi(RV_REG_FP, RV_REG_SP, stack_adjust, ctx);
 
        if (bpf_stack_adjust)
-               emit(rv_addi(RV_REG_S5, RV_REG_SP, bpf_stack_adjust), ctx);
+               emit_addi(RV_REG_S5, RV_REG_SP, bpf_stack_adjust, ctx);
 
        /* Program contains calls and tail calls, so RV_REG_TCC needs
         * to be saved across calls.
         */
        if (seen_tail_call(ctx) && seen_call(ctx))
-               emit(rv_addi(RV_REG_TCC_SAVED, RV_REG_TCC, 0), ctx);
+               emit_mv(RV_REG_TCC_SAVED, RV_REG_TCC, ctx);
 
        ctx->stack_size = stack_adjust;
 }
diff --git a/arch/riscv/net/bpf_jit_core.c b/arch/riscv/net/bpf_jit_core.c
index 709b94e..3630d44 100644
@@ -73,7 +73,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 
        if (ctx->offset) {
                extra_pass = true;
-               image_size = sizeof(u32) * ctx->ninsns;
+               image_size = sizeof(*ctx->insns) * ctx->ninsns;
                goto skip_init_ctx;
        }
 
@@ -103,7 +103,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
                        if (jit_data->header)
                                break;
 
-                       image_size = sizeof(u32) * ctx->ninsns;
+                       image_size = sizeof(*ctx->insns) * ctx->ninsns;
                        jit_data->header =
                                bpf_jit_binary_alloc(image_size,
                                                     &jit_data->image,
@@ -114,7 +114,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
                                goto out_offset;
                        }
 
-                       ctx->insns = (u32 *)jit_data->image;
+                       ctx->insns = (u16 *)jit_data->image;
                        /*
                         * Now, when the image is allocated, the image can
                         * potentially shrink more (auipc/jalr -> jal).
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index f4242b8..26f97a1 100644
@@ -489,6 +489,24 @@ static void save_restore_regs(struct bpf_jit *jit, int op, u32 stack_depth)
        } while (re <= last);
 }
 
+static void bpf_skip(struct bpf_jit *jit, int size)
+{
+       if (size >= 6 && !is_valid_rel(size)) {
+               /* brcl 0xf,size */
+               EMIT6_PCREL_RIL(0xc0f4000000, size);
+               size -= 6;
+       } else if (size >= 4 && is_valid_rel(size)) {
+               /* brc 0xf,size */
+               EMIT4_PCREL(0xa7f40000, size);
+               size -= 4;
+       }
+       while (size >= 2) {
+               /* bcr 0,%0 */
+               _EMIT2(0x0700);
+               size -= 2;
+       }
+}
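+
+/* Illustrative behaviour (not patch text): bpf_skip(jit, 6) on a gap
+ * within branch range emits "brc 0xf,6" (4 bytes) plus one "bcr 0,%r0"
+ * nop (2 bytes); the unconditional branch jumps straight past the gap,
+ * so the padding costs at most one taken branch at run time. */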
+
 /*
  * Emit function prologue
  *
@@ -501,10 +519,11 @@ static void bpf_jit_prologue(struct bpf_jit *jit, u32 stack_depth)
                /* xc STK_OFF_TCCNT(4,%r15),STK_OFF_TCCNT(%r15) */
                _EMIT6(0xd703f000 | STK_OFF_TCCNT, 0xf000 | STK_OFF_TCCNT);
        } else {
-               /* j tail_call_start: NOP if no tail calls are used */
-               EMIT4_PCREL(0xa7f40000, 6);
-               /* bcr 0,%0 */
-               EMIT2(0x0700, 0, REG_0);
+               /*
+                * There are no tail calls. Insert nops in order to have
+                * tail_call_start at a predictable offset.
+                */
+               bpf_skip(jit, 6);
        }
        /* Tail calls have to skip above initialization */
        jit->tail_call_start = jit->prg;
@@ -1268,8 +1287,12 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
                last = (i == fp->len - 1) ? 1 : 0;
                if (last)
                        break;
-               /* j <exit> */
-               EMIT4_PCREL(0xa7f40000, jit->exit_ip - jit->prg);
+               if (!is_first_pass(jit) && can_use_rel(jit, jit->exit_ip))
+                       /* brc 0xf, <exit> */
+                       EMIT4_PCREL_RIC(0xa7040000, 0xf, jit->exit_ip);
+               else
+                       /* brcl 0xf, <exit> */
+                       EMIT6_PCREL_RILC(0xc0040000, 0xf, jit->exit_ip);
                break;
        /*
         * Branch relative (number of skipped instructions) to offset on
@@ -1417,21 +1440,10 @@ branch_ks:
                }
                break;
 branch_ku:
-               is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
-               /* clfi or clgfi %dst,imm */
-               EMIT6_IMM(is_jmp32 ? 0xc20f0000 : 0xc20e0000,
-                         dst_reg, imm);
-               if (!is_first_pass(jit) &&
-                   can_use_rel(jit, addrs[i + off + 1])) {
-                       /* brc mask,off */
-                       EMIT4_PCREL_RIC(0xa7040000,
-                                       mask >> 12, addrs[i + off + 1]);
-               } else {
-                       /* brcl mask,off */
-                       EMIT6_PCREL_RILC(0xc0040000,
-                                        mask >> 12, addrs[i + off + 1]);
-               }
-               break;
+               /* lgfi %w1,imm (load sign-extended imm) */
+               src_reg = REG_1;
+               EMIT6_IMM(0xc0010000, src_reg, imm);
+               goto branch_xu;
 branch_xs:
                is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
                if (!is_first_pass(jit) &&
@@ -1510,7 +1522,14 @@ static bool bpf_is_new_addr_sane(struct bpf_jit *jit, int i)
  */
 static int bpf_set_addr(struct bpf_jit *jit, int i)
 {
-       if (!bpf_is_new_addr_sane(jit, i))
+       int delta;
+
+       if (is_codegen_pass(jit)) {
+               delta = jit->prg - jit->addrs[i];
+               if (delta < 0)
+                       bpf_skip(jit, -delta);
+       }
+       if (WARN_ON_ONCE(!bpf_is_new_addr_sane(jit, i)))
                return -1;
        jit->addrs[i] = jit->prg;
        return 0;
index 47d5b0c..722f799 100644 (file)
@@ -8,6 +8,7 @@
 enum netns_bpf_attach_type {
        NETNS_BPF_INVALID = -1,
        NETNS_BPF_FLOW_DISSECTOR = 0,
+       NETNS_BPF_SK_LOOKUP,
        MAX_NETNS_BPF_ATTACH_TYPE
 };
 
@@ -17,6 +18,8 @@ to_netns_bpf_attach_type(enum bpf_attach_type attach_type)
        switch (attach_type) {
        case BPF_FLOW_DISSECTOR:
                return NETNS_BPF_FLOW_DISSECTOR;
+       case BPF_SK_LOOKUP:
+               return NETNS_BPF_SK_LOOKUP;
        default:
                return NETNS_BPF_INVALID;
        }
index c67c88a..bae557f 100644 (file)
@@ -249,6 +249,7 @@ enum bpf_arg_type {
        ARG_PTR_TO_INT,         /* pointer to int */
        ARG_PTR_TO_LONG,        /* pointer to long */
        ARG_PTR_TO_SOCKET,      /* pointer to bpf_sock (fullsock) */
+       ARG_PTR_TO_SOCKET_OR_NULL,      /* pointer to bpf_sock (fullsock) or NULL */
        ARG_PTR_TO_BTF_ID,      /* pointer to in-kernel struct */
        ARG_PTR_TO_ALLOC_MEM,   /* pointer to dynamically allocated memory */
        ARG_PTR_TO_ALLOC_MEM_OR_NULL,   /* pointer to dynamically allocated memory or NULL */
@@ -667,6 +668,7 @@ struct bpf_jit_poke_descriptor {
 struct bpf_ctx_arg_aux {
        u32 offset;
        enum bpf_reg_type reg_type;
+       u32 btf_id;
 };
 
 struct bpf_prog_aux {
@@ -928,6 +930,9 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs,
 
 void bpf_prog_array_delete_safe(struct bpf_prog_array *progs,
                                struct bpf_prog *old_prog);
+int bpf_prog_array_delete_safe_at(struct bpf_prog_array *array, int index);
+int bpf_prog_array_update_at(struct bpf_prog_array *array, int index,
+                            struct bpf_prog *prog);
 int bpf_prog_array_copy_info(struct bpf_prog_array *array,
                             u32 *prog_ids, u32 request_cnt,
                             u32 *prog_cnt);
@@ -1272,6 +1277,7 @@ struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key);
 void __cpu_map_flush(void);
 int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
                    struct net_device *dev_rx);
+bool cpu_map_prog_allowed(struct bpf_map *map);
 
 /* Return map's numa specified by userspace */
 static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
@@ -1432,6 +1438,11 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
        return 0;
 }
 
+static inline bool cpu_map_prog_allowed(struct bpf_map *map)
+{
+       return false;
+}
+
 static inline struct bpf_prog *bpf_prog_get_type_path(const char *name,
                                enum bpf_prog_type type)
 {
@@ -1531,7 +1542,6 @@ static inline bool bpf_map_is_dev_bound(struct bpf_map *map)
 
 struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr);
 void bpf_map_offload_map_free(struct bpf_map *map);
-void init_btf_sock_ids(struct btf *btf);
 #else
 static inline int bpf_prog_offload_init(struct bpf_prog *prog,
                                        union bpf_attr *attr)
@@ -1557,9 +1567,6 @@ static inline struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
 static inline void bpf_map_offload_map_free(struct bpf_map *map)
 {
 }
-static inline void init_btf_sock_ids(struct btf *btf)
-{
-}
 #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
 
 #if defined(CONFIG_BPF_STREAM_PARSER)
index a18ae82..a52a568 100644 (file)
@@ -64,6 +64,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2,
 #ifdef CONFIG_INET
 BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport,
              struct sk_reuseport_md, struct sk_reuseport_kern)
+BPF_PROG_TYPE(BPF_PROG_TYPE_SK_LOOKUP, sk_lookup,
+             struct bpf_sk_lookup, struct bpf_sk_lookup_kern)
 #endif
 #if defined(CONFIG_BPF_JIT)
 BPF_PROG_TYPE(BPF_PROG_TYPE_STRUCT_OPS, bpf_struct_ops,
index 1cdb569..4867d54 100644 (file)
@@ -57,17 +57,20 @@ asm(                                                        \
  * .zero 4
  *
  */
-#define __BTF_ID_LIST(name)                            \
+#define __BTF_ID_LIST(name, scope)                     \
 asm(                                                   \
 ".pushsection " BTF_IDS_SECTION ",\"a\";       \n"     \
-".local " #name ";                             \n"     \
+"." #scope " " #name ";                        \n"     \
 #name ":;                                      \n"     \
 ".popsection;                                  \n");   \
 
 #define BTF_ID_LIST(name)                              \
-__BTF_ID_LIST(name)                                    \
+__BTF_ID_LIST(name, local)                             \
 extern u32 name[];
 
+#define BTF_ID_LIST_GLOBAL(name)                       \
+__BTF_ID_LIST(name, globl)
+
 /*
  * The BTF_ID_UNUSED macro defines 4 zero bytes.
  * It's used when we want to define 'unused' entry
@@ -90,7 +93,38 @@ asm(                                                 \
 #define BTF_ID_LIST(name) static u32 name[5];
 #define BTF_ID(prefix, name)
 #define BTF_ID_UNUSED
+#define BTF_ID_LIST_GLOBAL(name) u32 name[1];
 
 #endif /* CONFIG_DEBUG_INFO_BTF */
 
+#ifdef CONFIG_NET
+/* Define a list of socket types which can be the argument for
+ * skc_to_*_sock() helpers. All these sockets should have
+ * sock_common as the first member in their memory layout.
+ */
+#define BTF_SOCK_TYPE_xxx \
+       BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET, inet_sock)                    \
+       BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_CONN, inet_connection_sock)    \
+       BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_REQ, inet_request_sock)        \
+       BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_TW, inet_timewait_sock)        \
+       BTF_SOCK_TYPE(BTF_SOCK_TYPE_REQ, request_sock)                  \
+       BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCK, sock)                         \
+       BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCK_COMMON, sock_common)           \
+       BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP, tcp_sock)                      \
+       BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_REQ, tcp_request_sock)          \
+       BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_TW, tcp_timewait_sock)          \
+       BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP6, tcp6_sock)                    \
+       BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP, udp_sock)                      \
+       BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, udp6_sock)
+
+enum {
+#define BTF_SOCK_TYPE(name, str) name,
+BTF_SOCK_TYPE_xxx
+#undef BTF_SOCK_TYPE
+MAX_BTF_SOCK_TYPE,
+};
+
+extern u32 btf_sock_ids[];
+#endif
+
 #endif
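As a usage sketch, declaring a build-time ID list looks like this; resolve_btfids patches the u32 slots while linking vmlinux (the list and type names here are purely illustrative):

    #include <linux/btf_ids.h>

    BTF_ID_LIST(demo_btf_ids)           /* hypothetical list name */
    BTF_ID(struct, tcp_sock)            /* demo_btf_ids[0] */
    BTF_ID(struct, udp_sock)            /* demo_btf_ids[1] */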
index 4d049c8..1c6b6d9 100644 (file)
@@ -1278,4 +1278,151 @@ struct bpf_sockopt_kern {
 
 int copy_bpf_fprog_from_user(struct sock_fprog *dst, void __user *src, int len);
 
+struct bpf_sk_lookup_kern {
+       u16             family;
+       u16             protocol;
+       struct {
+               __be32 saddr;
+               __be32 daddr;
+       } v4;
+       struct {
+               const struct in6_addr *saddr;
+               const struct in6_addr *daddr;
+       } v6;
+       __be16          sport;
+       u16             dport;
+       struct sock     *selected_sk;
+       bool            no_reuseport;
+};
+
+extern struct static_key_false bpf_sk_lookup_enabled;
+
+/* Runners for BPF_SK_LOOKUP programs to invoke on socket lookup.
+ *
+ * Allowed return values for a BPF SK_LOOKUP program are SK_PASS and
+ * SK_DROP. Their meaning is as follows:
+ *
+ *  SK_PASS && ctx.selected_sk != NULL: use selected_sk as lookup result
+ *  SK_PASS && ctx.selected_sk == NULL: continue to htable-based socket lookup
+ *  SK_DROP                           : terminate lookup with -ECONNREFUSED
+ *
+ * This macro aggregates return values and selected sockets from
+ * multiple BPF programs according to the following rules, in order:
+ *
+ *  1. If any program returned SK_PASS and a non-NULL ctx.selected_sk,
+ *     the macro result is SK_PASS and the last such ctx.selected_sk is used.
+ *  2. If any program returned SK_DROP,
+ *     the macro result is SK_DROP.
+ *  3. Otherwise the result is SK_PASS and ctx.selected_sk is NULL.
+ *
+ * Caller must ensure that the prog array is non-NULL, and that the
+ * array as well as the programs it contains remain valid.
+ */
+#define BPF_PROG_SK_LOOKUP_RUN_ARRAY(array, ctx, func)                 \
+       ({                                                              \
+               struct bpf_sk_lookup_kern *_ctx = &(ctx);               \
+               struct bpf_prog_array_item *_item;                      \
+               struct sock *_selected_sk = NULL;                       \
+               bool _no_reuseport = false;                             \
+               struct bpf_prog *_prog;                                 \
+               bool _all_pass = true;                                  \
+               u32 _ret;                                               \
+                                                                       \
+               migrate_disable();                                      \
+               _item = &(array)->items[0];                             \
+               while ((_prog = READ_ONCE(_item->prog))) {              \
+                       /* restore most recent selection */             \
+                       _ctx->selected_sk = _selected_sk;               \
+                       _ctx->no_reuseport = _no_reuseport;             \
+                                                                       \
+                       _ret = func(_prog, _ctx);                       \
+                       if (_ret == SK_PASS && _ctx->selected_sk) {     \
+                               /* remember last non-NULL socket */     \
+                               _selected_sk = _ctx->selected_sk;       \
+                               _no_reuseport = _ctx->no_reuseport;     \
+                       } else if (_ret == SK_DROP && _all_pass) {      \
+                               _all_pass = false;                      \
+                       }                                               \
+                       _item++;                                        \
+               }                                                       \
+               _ctx->selected_sk = _selected_sk;                       \
+               _ctx->no_reuseport = _no_reuseport;                     \
+               migrate_enable();                                       \
+               _all_pass || _selected_sk ? SK_PASS : SK_DROP;          \
+        })
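A plain C model of the three aggregation rules above, for clarity (userspace sketch, not kernel code; the SK_DROP/SK_PASS values match enum sk_action):

    #include <stddef.h>

    enum { SK_DROP = 0, SK_PASS = 1 };

    struct result { int verdict; void *sk; };

    static int aggregate(const struct result *r, int n, void **selected)
    {
            void *last_sk = NULL;
            int all_pass = 1, i;

            for (i = 0; i < n; i++) {
                    if (r[i].verdict == SK_PASS && r[i].sk)
                            last_sk = r[i].sk;      /* rule 1: last wins */
                    else if (r[i].verdict == SK_DROP)
                            all_pass = 0;           /* rule 2 */
            }
            *selected = last_sk;
            return (all_pass || last_sk) ? SK_PASS : SK_DROP;  /* rule 3 */
    }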
+
+static inline bool bpf_sk_lookup_run_v4(struct net *net, int protocol,
+                                       const __be32 saddr, const __be16 sport,
+                                       const __be32 daddr, const u16 dport,
+                                       struct sock **psk)
+{
+       struct bpf_prog_array *run_array;
+       struct sock *selected_sk = NULL;
+       bool no_reuseport = false;
+
+       rcu_read_lock();
+       run_array = rcu_dereference(net->bpf.run_array[NETNS_BPF_SK_LOOKUP]);
+       if (run_array) {
+               struct bpf_sk_lookup_kern ctx = {
+                       .family         = AF_INET,
+                       .protocol       = protocol,
+                       .v4.saddr       = saddr,
+                       .v4.daddr       = daddr,
+                       .sport          = sport,
+                       .dport          = dport,
+               };
+               u32 act;
+
+               act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, BPF_PROG_RUN);
+               if (act == SK_PASS) {
+                       selected_sk = ctx.selected_sk;
+                       no_reuseport = ctx.no_reuseport;
+               } else {
+                       selected_sk = ERR_PTR(-ECONNREFUSED);
+               }
+       }
+       rcu_read_unlock();
+       *psk = selected_sk;
+       return no_reuseport;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol,
+                                       const struct in6_addr *saddr,
+                                       const __be16 sport,
+                                       const struct in6_addr *daddr,
+                                       const u16 dport,
+                                       struct sock **psk)
+{
+       struct bpf_prog_array *run_array;
+       struct sock *selected_sk = NULL;
+       bool no_reuseport = false;
+
+       rcu_read_lock();
+       run_array = rcu_dereference(net->bpf.run_array[NETNS_BPF_SK_LOOKUP]);
+       if (run_array) {
+               struct bpf_sk_lookup_kern ctx = {
+                       .family         = AF_INET6,
+                       .protocol       = protocol,
+                       .v6.saddr       = saddr,
+                       .v6.daddr       = daddr,
+                       .sport          = sport,
+                       .dport          = dport,
+               };
+               u32 act;
+
+               act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, BPF_PROG_RUN);
+               if (act == SK_PASS) {
+                       selected_sk = ctx.selected_sk;
+                       no_reuseport = ctx.no_reuseport;
+               } else {
+                       selected_sk = ERR_PTR(-ECONNREFUSED);
+               }
+       }
+       rcu_read_unlock();
+       *psk = selected_sk;
+       return no_reuseport;
+}
+#endif /* IS_ENABLED(CONFIG_IPV6) */
+
 #endif /* __LINUX_FILTER_H__ */
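A sketch of the expected call site in a protocol's lookup path, modeled on the UDP/TCP integration later in this series (the function and variable names are illustrative):

    /* run before the regular htable scan; IPv4/UDP variant */
    static struct sock *demo_lookup_run_bpf(struct net *net,
                                            __be32 saddr, __be16 sport,
                                            __be32 daddr, u16 hnum)
    {
            struct sock *sk = NULL;
            bool no_reuseport;

            if (!static_branch_unlikely(&bpf_sk_lookup_enabled))
                    return NULL;    /* no program attached in any netns */

            no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_UDP,
                                                saddr, sport,
                                                daddr, hnum, &sk);
            if (no_reuseport || IS_ERR_OR_NULL(sk))
                    return sk;  /* socket, NULL, or ERR_PTR(-ECONNREFUSED) */

            /* otherwise run reuseport selection on sk before using it */
            return sk;
    }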
index d3005be..dbe9c60 100644 (file)
@@ -104,6 +104,7 @@ struct xdp_frame {
        struct net_device *dev_rx; /* used by cpumap */
 };
 
+
 static inline struct skb_shared_info *
 xdp_get_shared_info_from_frame(struct xdp_frame *frame)
 {
@@ -113,6 +114,12 @@ xdp_get_shared_info_from_frame(struct xdp_frame *frame)
                                SKB_DATA_ALIGN(sizeof(struct skb_shared_info)));
 }
 
+struct xdp_cpumap_stats {
+       unsigned int redirect;
+       unsigned int pass;
+       unsigned int drop;
+};
+
 /* Clear kernel pointers in xdp_frame */
 static inline void xdp_scrub_frame(struct xdp_frame *frame)
 {
@@ -136,39 +143,48 @@ void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp)
        xdp->frame_sz = frame->frame_sz;
 }
 
-/* Convert xdp_buff to xdp_frame */
 static inline
-struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp)
+int xdp_update_frame_from_buff(struct xdp_buff *xdp,
+                              struct xdp_frame *xdp_frame)
 {
-       struct xdp_frame *xdp_frame;
-       int metasize;
-       int headroom;
-
-       if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL)
-               return xdp_convert_zc_to_xdp_frame(xdp);
+       int metasize, headroom;
 
        /* Assure headroom is available for storing info */
        headroom = xdp->data - xdp->data_hard_start;
        metasize = xdp->data - xdp->data_meta;
        metasize = metasize > 0 ? metasize : 0;
        if (unlikely((headroom - metasize) < sizeof(*xdp_frame)))
-               return NULL;
+               return -ENOSPC;
 
        /* Catch if driver didn't reserve tailroom for skb_shared_info */
        if (unlikely(xdp->data_end > xdp_data_hard_end(xdp))) {
                XDP_WARN("Driver BUG: missing reserved tailroom");
-               return NULL;
+               return -ENOSPC;
        }
 
-       /* Store info in top of packet */
-       xdp_frame = xdp->data_hard_start;
-
        xdp_frame->data = xdp->data;
        xdp_frame->len  = xdp->data_end - xdp->data;
        xdp_frame->headroom = headroom - sizeof(*xdp_frame);
        xdp_frame->metasize = metasize;
        xdp_frame->frame_sz = xdp->frame_sz;
 
+       return 0;
+}
+
+/* Convert xdp_buff to xdp_frame */
+static inline
+struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp)
+{
+       struct xdp_frame *xdp_frame;
+
+       if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL)
+               return xdp_convert_zc_to_xdp_frame(xdp);
+
+       /* Store info in top of packet */
+       xdp_frame = xdp->data_hard_start;
+       if (unlikely(xdp_update_frame_from_buff(xdp, xdp_frame) < 0))
+               return NULL;
+
        /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */
        xdp_frame->mem = xdp->rxq->mem;
 
index b73d3e1..cd24e8a 100644 (file)
@@ -177,9 +177,9 @@ DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map_err,
 TRACE_EVENT(xdp_cpumap_kthread,
 
        TP_PROTO(int map_id, unsigned int processed,  unsigned int drops,
-                int sched),
+                int sched, struct xdp_cpumap_stats *xdp_stats),
 
-       TP_ARGS(map_id, processed, drops, sched),
+       TP_ARGS(map_id, processed, drops, sched, xdp_stats),
 
        TP_STRUCT__entry(
                __field(int, map_id)
@@ -188,6 +188,9 @@ TRACE_EVENT(xdp_cpumap_kthread,
                __field(unsigned int, drops)
                __field(unsigned int, processed)
                __field(int, sched)
+               __field(unsigned int, xdp_pass)
+               __field(unsigned int, xdp_drop)
+               __field(unsigned int, xdp_redirect)
        ),
 
        TP_fast_assign(
@@ -197,16 +200,21 @@ TRACE_EVENT(xdp_cpumap_kthread,
                __entry->drops          = drops;
                __entry->processed      = processed;
                __entry->sched  = sched;
+               __entry->xdp_pass       = xdp_stats->pass;
+               __entry->xdp_drop       = xdp_stats->drop;
+               __entry->xdp_redirect   = xdp_stats->redirect;
        ),
 
        TP_printk("kthread"
                  " cpu=%d map_id=%d action=%s"
                  " processed=%u drops=%u"
-                 " sched=%d",
+                 " sched=%d"
+                 " xdp_pass=%u xdp_drop=%u xdp_redirect=%u",
                  __entry->cpu, __entry->map_id,
                  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
                  __entry->processed, __entry->drops,
-                 __entry->sched)
+                 __entry->sched,
+                 __entry->xdp_pass, __entry->xdp_drop, __entry->xdp_redirect)
 );
 
 TRACE_EVENT(xdp_cpumap_enqueue,
index 5e38638..54d0c88 100644 (file)
@@ -189,6 +189,7 @@ enum bpf_prog_type {
        BPF_PROG_TYPE_STRUCT_OPS,
        BPF_PROG_TYPE_EXT,
        BPF_PROG_TYPE_LSM,
+       BPF_PROG_TYPE_SK_LOOKUP,
 };
 
 enum bpf_attach_type {
@@ -227,6 +228,8 @@ enum bpf_attach_type {
        BPF_CGROUP_INET6_GETSOCKNAME,
        BPF_XDP_DEVMAP,
        BPF_CGROUP_INET_SOCK_RELEASE,
+       BPF_XDP_CPUMAP,
+       BPF_SK_LOOKUP,
        __MAX_BPF_ATTACH_TYPE
 };
 
@@ -2419,7 +2422,7 @@ union bpf_attr {
  *                     Look for an IPv6 socket.
  *
  *             If the *netns* is a negative signed 32-bit integer, then the
- *             socket lookup table in the netns associated with the *ctx* will
+ *             socket lookup table in the netns associated with the *ctx*
  *             will be used. For the TC hooks, this is the netns of the device
  *             in the skb. For socket hooks, this is the netns of the socket.
  *             If *netns* is any other signed 32-bit value greater than or
@@ -2456,7 +2459,7 @@ union bpf_attr {
  *                     Look for an IPv6 socket.
  *
  *             If the *netns* is a negative signed 32-bit integer, then the
- *             socket lookup table in the netns associated with the *ctx* will
+ *             socket lookup table in the netns associated with the *ctx*
  *             will be used. For the TC hooks, this is the netns of the device
  *             in the skb. For socket hooks, this is the netns of the socket.
  *             If *netns* is any other signed 32-bit value greater than or
@@ -3068,6 +3071,10 @@ union bpf_attr {
  *
  * long bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags)
  *     Description
+ *             Helper is overloaded depending on BPF program type. This
+ *             description applies to **BPF_PROG_TYPE_SCHED_CLS** and
+ *             **BPF_PROG_TYPE_SCHED_ACT** programs.
+ *
  *             Assign the *sk* to the *skb*. When combined with appropriate
  *             routing configuration to receive the packet towards the socket,
  *             will cause *skb* to be delivered to the specified socket.
@@ -3093,6 +3100,56 @@ union bpf_attr {
  *             **-ESOCKTNOSUPPORT** if the socket type is not supported
  *             (reuseport).
  *
+ * long bpf_sk_assign(struct bpf_sk_lookup *ctx, struct bpf_sock *sk, u64 flags)
+ *     Description
+ *             Helper is overloaded depending on BPF program type. This
+ *             description applies to **BPF_PROG_TYPE_SK_LOOKUP** programs.
+ *
+ *             Select the *sk* as a result of a socket lookup.
+ *
+ *             For the operation to succeed, the passed socket must be
+ *             compatible with the packet description provided by the
+ *             *ctx* object.
+ *
+ *             The L4 protocol (**IPPROTO_TCP** or **IPPROTO_UDP**) must
+ *             be an exact match, while the IP family (**AF_INET** or
+ *             **AF_INET6**) must be compatible; that is, IPv6 sockets
+ *             that are not v6-only can be selected for IPv4 packets.
+ *
+ *             Only TCP listeners and unconnected UDP sockets can be
+ *             selected. *sk* can also be NULL to reset any previous
+ *             selection.
+ *
+ *             The *flags* argument can be a combination of the following
+ *             values:
+ *
+ *             * **BPF_SK_LOOKUP_F_REPLACE** to override the previous
+ *               socket selection, potentially done by a BPF program
+ *               that ran before us.
+ *
+ *             * **BPF_SK_LOOKUP_F_NO_REUSEPORT** to skip
+ *               load-balancing within reuseport group for the socket
+ *               being selected.
+ *
+ *             On success *ctx->sk* will point to the selected socket.
+ *
+ *     Return
+ *             0 on success, or a negative errno in case of failure.
+ *
+ *             * **-EAFNOSUPPORT** if socket family (*sk->family*) is
+ *               not compatible with packet family (*ctx->family*).
+ *
+ *             * **-EEXIST** if socket has been already selected,
+ *               potentially by another program, and
+ *               **BPF_SK_LOOKUP_F_REPLACE** flag was not specified.
+ *
+ *             * **-EINVAL** if unsupported flags were specified.
+ *
+ *             * **-EPROTOTYPE** if socket L4 protocol
+ *               (*sk->protocol*) doesn't match packet protocol
+ *               (*ctx->protocol*).
+ *
+ *             * **-ESOCKTNOSUPPORT** if socket is not in allowed
+ *               state (TCP listening or UDP unconnected).
+ *
  * u64 bpf_ktime_get_boot_ns(void)
  *     Description
  *             Return the time elapsed since system boot, in nanoseconds.
@@ -3606,6 +3663,12 @@ enum {
        BPF_RINGBUF_HDR_SZ              = 8,
 };
 
+/* BPF_FUNC_sk_assign flags in bpf_sk_lookup context. */
+enum {
+       BPF_SK_LOOKUP_F_REPLACE         = (1ULL << 0),
+       BPF_SK_LOOKUP_F_NO_REUSEPORT    = (1ULL << 1),
+};
+
 /* Mode for BPF_FUNC_skb_adjust_room helper. */
 enum bpf_adj_room_mode {
        BPF_ADJ_ROOM_NET,
@@ -3849,6 +3912,19 @@ struct bpf_devmap_val {
        } bpf_prog;
 };
 
+/* CPUMAP map-value layout
+ *
+ * The struct data-layout of map-value is a configuration interface.
+ * New members can only be added to the end of this structure.
+ */
+struct bpf_cpumap_val {
+       __u32 qsize;    /* queue size to remote target CPU */
+       union {
+               int   fd;       /* prog fd on map write */
+               __u32 id;       /* prog id on map read */
+       } bpf_prog;
+};
+
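A userspace sketch of programming the extended map value (assumes libbpf's bpf_map_update_elem(); the fds, qsize, and CPU number are illustrative, and the program must have been loaded with expected_attach_type BPF_XDP_CPUMAP):

    #include <bpf/bpf.h>

    static int demo_set_cpu(int map_fd, __u32 cpu, int prog_fd)
    {
            struct bpf_cpumap_val val = {
                    .qsize = 2048,          /* ptr_ring slots on that CPU */
                    .bpf_prog.fd = prog_fd, /* fd <= 0 means: no program */
            };

            return bpf_map_update_elem(map_fd, &cpu, &val, 0);
    }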
 enum sk_action {
        SK_DROP = 0,
        SK_PASS,
@@ -3986,7 +4062,7 @@ struct bpf_link_info {
 
 /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed
  * by user and intended to be used by socket (e.g. to bind to, depends on
- * attach attach type).
+ * attach type).
  */
 struct bpf_sock_addr {
        __u32 user_family;      /* Allows 4-byte read, but no write. */
@@ -4335,4 +4411,19 @@ struct bpf_pidns_info {
        __u32 pid;
        __u32 tgid;
 };
+
+/* User accessible data for SK_LOOKUP programs. Add new fields at the end. */
+struct bpf_sk_lookup {
+       __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */
+
+       __u32 family;           /* Protocol family (AF_INET, AF_INET6) */
+       __u32 protocol;         /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */
+       __u32 remote_ip4;       /* Network byte order */
+       __u32 remote_ip6[4];    /* Network byte order */
+       __u32 remote_port;      /* Network byte order */
+       __u32 local_ip4;        /* Network byte order */
+       __u32 local_ip6[4];     /* Network byte order */
+       __u32 local_port;       /* Host byte order */
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
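Tying the context, flags, and bpf_sk_assign() together, a minimal SK_LOOKUP program sketch modeled on the new selftests (the map, port number, and section name are illustrative; section naming depends on the libbpf version):

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    struct {
            __uint(type, BPF_MAP_TYPE_SOCKMAP);
            __uint(max_entries, 1);
            __type(key, __u32);
            __type(value, __u64);
    } redir_map SEC(".maps");

    SEC("sk_lookup/demo")
    int select_sock(struct bpf_sk_lookup *ctx)
    {
            const __u32 zero = 0;
            struct bpf_sock *sk;
            int err;

            if (ctx->local_port != 7777)    /* illustrative service port */
                    return SK_PASS;         /* fall back to htable lookup */

            sk = bpf_map_lookup_elem(&redir_map, &zero);
            if (!sk)
                    return SK_PASS;

            err = bpf_sk_assign(ctx, sk, 0);
            bpf_sk_release(sk);
            return err ? SK_DROP : SK_PASS;
    }

    char _license[] SEC("license") = "GPL";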
index 03d6d43..ee36b7f 100644 (file)
@@ -3672,7 +3672,6 @@ struct btf *btf_parse_vmlinux(void)
                goto errout;
 
        bpf_struct_ops_init(btf, log);
-       init_btf_sock_ids(btf);
 
        btf_verifier_env_free(env);
        refcount_set(&btf->refcnt, 1);
@@ -3818,16 +3817,17 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
                return true;
 
        /* this is a pointer to another type */
-       info->reg_type = PTR_TO_BTF_ID;
        for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
                const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
 
                if (ctx_arg_info->offset == off) {
                        info->reg_type = ctx_arg_info->reg_type;
-                       break;
+                       info->btf_id = ctx_arg_info->btf_id;
+                       return true;
                }
        }
 
+       info->reg_type = PTR_TO_BTF_ID;
        if (tgt_prog) {
                ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg);
                if (ret > 0) {
index 9df4cc9..7be02e5 100644 (file)
@@ -1958,6 +1958,61 @@ void bpf_prog_array_delete_safe(struct bpf_prog_array *array,
                }
 }
 
+/**
+ * bpf_prog_array_delete_safe_at() - Replaces the program at the given
+ *                                   index into the program array with
+ *                                   a dummy no-op program.
+ * @array: a bpf_prog_array
+ * @index: the index of the program to replace
+ *
+ * Skips over dummy programs, by not counting them, when calculating
+ * the position of the program to replace.
+ *
+ * Return:
+ * * 0         - Success
+ * * -EINVAL   - Invalid index value. Must be a non-negative integer.
+ * * -ENOENT   - Index out of range
+ */
+int bpf_prog_array_delete_safe_at(struct bpf_prog_array *array, int index)
+{
+       return bpf_prog_array_update_at(array, index, &dummy_bpf_prog.prog);
+}
+
+/**
+ * bpf_prog_array_update_at() - Updates the program at the given index
+ *                              into the program array.
+ * @array: a bpf_prog_array
+ * @index: the index of the program to update
+ * @prog: the program to insert into the array
+ *
+ * Skips over dummy programs, by not counting them, when calculating
+ * the position of the program to update.
+ *
+ * Return:
+ * * 0         - Success
+ * * -EINVAL   - Invalid index value. Must be a non-negative integer.
+ * * -ENOENT   - Index out of range
+ */
+int bpf_prog_array_update_at(struct bpf_prog_array *array, int index,
+                            struct bpf_prog *prog)
+{
+       struct bpf_prog_array_item *item;
+
+       if (unlikely(index < 0))
+               return -EINVAL;
+
+       for (item = array->items; item->prog; item++) {
+               if (item->prog == &dummy_bpf_prog.prog)
+                       continue;
+               if (!index) {
+                       WRITE_ONCE(item->prog, prog);
+                       return 0;
+               }
+               index--;
+       }
+       return -ENOENT;
+}
+
 int bpf_prog_array_copy(struct bpf_prog_array *old_array,
                        struct bpf_prog *exclude_prog,
                        struct bpf_prog *include_prog,
index bd86580..f1c4652 100644 (file)
@@ -52,7 +52,6 @@ struct xdp_bulk_queue {
 struct bpf_cpu_map_entry {
        u32 cpu;    /* kthread CPU and map index */
        int map_id; /* Back reference to map */
-       u32 qsize;  /* Queue size placeholder for map lookup */
 
        /* XDP can run multiple RX-ring queues, need __percpu enqueue store */
        struct xdp_bulk_queue __percpu *bulkq;
@@ -62,10 +61,14 @@ struct bpf_cpu_map_entry {
        /* Queue with potential multi-producers, and single-consumer kthread */
        struct ptr_ring *queue;
        struct task_struct *kthread;
-       struct work_struct kthread_stop_wq;
+
+       struct bpf_cpumap_val value;
+       struct bpf_prog *prog;
 
        atomic_t refcnt; /* Control when this struct can be free'ed */
        struct rcu_head rcu;
+
+       struct work_struct kthread_stop_wq;
 };
 
 struct bpf_cpu_map {
@@ -80,6 +83,7 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq);
 
 static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 {
+       u32 value_size = attr->value_size;
        struct bpf_cpu_map *cmap;
        int err = -ENOMEM;
        u64 cost;
@@ -90,7 +94,9 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 
        /* check sanity of attributes */
        if (attr->max_entries == 0 || attr->key_size != 4 ||
-           attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+           (value_size != offsetofend(struct bpf_cpumap_val, qsize) &&
+            value_size != offsetofend(struct bpf_cpumap_val, bpf_prog.fd)) ||
+           attr->map_flags & ~BPF_F_NUMA_NODE)
                return ERR_PTR(-EINVAL);
 
        cmap = kzalloc(sizeof(*cmap), GFP_USER);
@@ -212,6 +218,8 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
 static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
 {
        if (atomic_dec_and_test(&rcpu->refcnt)) {
+               if (rcpu->prog)
+                       bpf_prog_put(rcpu->prog);
                /* The queue should be empty at this point */
                __cpu_map_ring_cleanup(rcpu->queue);
                ptr_ring_cleanup(rcpu->queue, NULL);
@@ -220,6 +228,75 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
        }
 }
 
+static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
+                                   void **frames, int n,
+                                   struct xdp_cpumap_stats *stats)
+{
+       struct xdp_rxq_info rxq;
+       struct xdp_buff xdp;
+       int i, nframes = 0;
+
+       if (!rcpu->prog)
+               return n;
+
+       rcu_read_lock_bh();
+
+       xdp_set_return_frame_no_direct();
+       xdp.rxq = &rxq;
+
+       for (i = 0; i < n; i++) {
+               struct xdp_frame *xdpf = frames[i];
+               u32 act;
+               int err;
+
+               rxq.dev = xdpf->dev_rx;
+               rxq.mem = xdpf->mem;
+               /* TODO: report queue_index to xdp_rxq_info */
+
+               xdp_convert_frame_to_buff(xdpf, &xdp);
+
+               act = bpf_prog_run_xdp(rcpu->prog, &xdp);
+               switch (act) {
+               case XDP_PASS:
+                       err = xdp_update_frame_from_buff(&xdp, xdpf);
+                       if (err < 0) {
+                               xdp_return_frame(xdpf);
+                               stats->drop++;
+                       } else {
+                               frames[nframes++] = xdpf;
+                               stats->pass++;
+                       }
+                       break;
+               case XDP_REDIRECT:
+                       err = xdp_do_redirect(xdpf->dev_rx, &xdp,
+                                             rcpu->prog);
+                       if (unlikely(err)) {
+                               xdp_return_frame(xdpf);
+                               stats->drop++;
+                       } else {
+                               stats->redirect++;
+                       }
+                       break;
+               default:
+                       bpf_warn_invalid_xdp_action(act);
+                       /* fallthrough */
+               case XDP_DROP:
+                       xdp_return_frame(xdpf);
+                       stats->drop++;
+                       break;
+               }
+       }
+
+       if (stats->redirect)
+               xdp_do_flush_map();
+
+       xdp_clear_return_frame_no_direct();
+
+       rcu_read_unlock_bh(); /* resched point, may call do_softirq() */
+
+       return nframes;
+}
+
 #define CPUMAP_BATCH 8
 
 static int cpu_map_kthread_run(void *data)
@@ -234,11 +311,12 @@ static int cpu_map_kthread_run(void *data)
         * kthread_stop signal until queue is empty.
         */
        while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
+               struct xdp_cpumap_stats stats = {}; /* zero stats */
+               gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
                unsigned int drops = 0, sched = 0;
                void *frames[CPUMAP_BATCH];
                void *skbs[CPUMAP_BATCH];
-               gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
-               int i, n, m;
+               int i, n, m, nframes;
 
                /* Release CPU reschedule checks */
                if (__ptr_ring_empty(rcpu->queue)) {
@@ -259,8 +337,8 @@ static int cpu_map_kthread_run(void *data)
                 * kthread CPU pinned. Lockless access to ptr_ring
                 * consume side valid as no-resize allowed of queue.
                 */
-               n = ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH);
-
+               n = __ptr_ring_consume_batched(rcpu->queue, frames,
+                                              CPUMAP_BATCH);
                for (i = 0; i < n; i++) {
                        void *f = frames[i];
                        struct page *page = virt_to_page(f);
@@ -272,15 +350,19 @@ static int cpu_map_kthread_run(void *data)
                        prefetchw(page);
                }
 
-               m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, n, skbs);
-               if (unlikely(m == 0)) {
-                       for (i = 0; i < n; i++)
-                               skbs[i] = NULL; /* effect: xdp_return_frame */
-                       drops = n;
+               /* Support running another XDP prog on this CPU */
+               nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, n, &stats);
+               if (nframes) {
+                       m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, nframes, skbs);
+                       if (unlikely(m == 0)) {
+                               for (i = 0; i < nframes; i++)
+                                       skbs[i] = NULL; /* effect: xdp_return_frame */
+                               drops += nframes;
+                       }
                }
 
                local_bh_disable();
-               for (i = 0; i < n; i++) {
+               for (i = 0; i < nframes; i++) {
                        struct xdp_frame *xdpf = frames[i];
                        struct sk_buff *skb = skbs[i];
                        int ret;
@@ -297,7 +379,7 @@ static int cpu_map_kthread_run(void *data)
                                drops++;
                }
                /* Feedback loop via tracepoint */
-               trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched);
+               trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched, &stats);
 
                local_bh_enable(); /* resched point, may call do_softirq() */
        }
@@ -307,13 +389,38 @@ static int cpu_map_kthread_run(void *data)
        return 0;
 }
 
-static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
-                                                      int map_id)
+bool cpu_map_prog_allowed(struct bpf_map *map)
 {
+       return map->map_type == BPF_MAP_TYPE_CPUMAP &&
+              map->value_size != offsetofend(struct bpf_cpumap_val, qsize);
+}
+
+static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd)
+{
+       struct bpf_prog *prog;
+
+       prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
+       if (IS_ERR(prog))
+               return PTR_ERR(prog);
+
+       if (prog->expected_attach_type != BPF_XDP_CPUMAP) {
+               bpf_prog_put(prog);
+               return -EINVAL;
+       }
+
+       rcpu->value.bpf_prog.id = prog->aux->id;
+       rcpu->prog = prog;
+
+       return 0;
+}
+
+static struct bpf_cpu_map_entry *
+__cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id)
+{
+       int numa, err, i, fd = value->bpf_prog.fd;
        gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
        struct bpf_cpu_map_entry *rcpu;
        struct xdp_bulk_queue *bq;
-       int numa, err, i;
 
        /* Have map->numa_node, but choose node of redirect target CPU */
        numa = cpu_to_node(cpu);
@@ -338,19 +445,22 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
        if (!rcpu->queue)
                goto free_bulkq;
 
-       err = ptr_ring_init(rcpu->queue, qsize, gfp);
+       err = ptr_ring_init(rcpu->queue, value->qsize, gfp);
        if (err)
                goto free_queue;
 
        rcpu->cpu    = cpu;
        rcpu->map_id = map_id;
-       rcpu->qsize  = qsize;
+       rcpu->value.qsize  = value->qsize;
+
+       if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd))
+               goto free_ptr_ring;
 
        /* Setup kthread */
        rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
                                               "cpumap/%d/map:%d", cpu, map_id);
        if (IS_ERR(rcpu->kthread))
-               goto free_ptr_ring;
+               goto free_prog;
 
        get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */
        get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */
@@ -361,6 +471,9 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
 
        return rcpu;
 
+free_prog:
+       if (rcpu->prog)
+               bpf_prog_put(rcpu->prog);
 free_ptr_ring:
        ptr_ring_cleanup(rcpu->queue, NULL);
 free_queue:
@@ -437,12 +550,12 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
                               u64 map_flags)
 {
        struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+       struct bpf_cpumap_val cpumap_value = {};
        struct bpf_cpu_map_entry *rcpu;
-
        /* Array index key correspond to CPU number */
        u32 key_cpu = *(u32 *)key;
-       /* Value is the queue size */
-       u32 qsize = *(u32 *)value;
+
+       memcpy(&cpumap_value, value, map->value_size);
 
        if (unlikely(map_flags > BPF_EXIST))
                return -EINVAL;
@@ -450,18 +563,18 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
                return -E2BIG;
        if (unlikely(map_flags == BPF_NOEXIST))
                return -EEXIST;
-       if (unlikely(qsize > 16384)) /* sanity limit on qsize */
+       if (unlikely(cpumap_value.qsize > 16384)) /* sanity limit on qsize */
                return -EOVERFLOW;
 
        /* Make sure CPU is a valid possible cpu */
        if (key_cpu >= nr_cpumask_bits || !cpu_possible(key_cpu))
                return -ENODEV;
 
-       if (qsize == 0) {
+       if (cpumap_value.qsize == 0) {
                rcpu = NULL; /* Same as deleting */
        } else {
                /* Updating qsize cause re-allocation of bpf_cpu_map_entry */
-               rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id);
+               rcpu = __cpu_map_entry_alloc(&cpumap_value, key_cpu, map->id);
                if (!rcpu)
                        return -ENOMEM;
                rcpu->cmap = cmap;
@@ -523,7 +636,7 @@ static void *cpu_map_lookup_elem(struct bpf_map *map, void *key)
        struct bpf_cpu_map_entry *rcpu =
                __cpu_map_lookup_elem(map, *(u32 *)key);
 
-       return rcpu ? &rcpu->qsize : NULL;
+       return rcpu ? &rcpu->value : NULL;
 }
 
 static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
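For reference, a cpumap-attached XDP program sketch in the style of the new selftest (the "xdp_cpumap/" section prefix follows the libbpf convention added alongside this series; the ifindex check is illustrative):

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    SEC("xdp_cpumap/dummy")
    int xdp_dummy_cm(struct xdp_md *ctx)
    {
            /* runs on the remote CPU's kthread, once per enqueued frame */
            if (ctx->ingress_ifindex == 1)  /* e.g. drop loopback frames */
                    return XDP_DROP;

            return XDP_PASS;
    }

    char _license[] SEC("license") = "GPL";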
index c69071e..8a7af11 100644 (file)
@@ -4,6 +4,7 @@
 #include <linux/fs.h>
 #include <linux/filter.h>
 #include <linux/kernel.h>
+#include <linux/btf_ids.h>
 
 struct bpf_iter_seq_map_info {
        u32 mid;
@@ -81,7 +82,10 @@ static const struct seq_operations bpf_map_seq_ops = {
        .show   = bpf_map_seq_show,
 };
 
-static const struct bpf_iter_reg bpf_map_reg_info = {
+BTF_ID_LIST(btf_bpf_map_id)
+BTF_ID(struct, bpf_map)
+
+static struct bpf_iter_reg bpf_map_reg_info = {
        .target                 = "bpf_map",
        .seq_ops                = &bpf_map_seq_ops,
        .init_seq_private       = NULL,
@@ -96,6 +100,7 @@ static const struct bpf_iter_reg bpf_map_reg_info = {
 
 static int __init bpf_map_iter_init(void)
 {
+       bpf_map_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_map_id;
        return bpf_iter_reg_target(&bpf_map_reg_info);
 }
 
index 310241c..71405ed 100644 (file)
@@ -25,6 +25,32 @@ struct bpf_netns_link {
 /* Protects updates to netns_bpf */
 DEFINE_MUTEX(netns_bpf_mutex);
 
+static void netns_bpf_attach_type_unneed(enum netns_bpf_attach_type type)
+{
+       switch (type) {
+#ifdef CONFIG_INET
+       case NETNS_BPF_SK_LOOKUP:
+               static_branch_dec(&bpf_sk_lookup_enabled);
+               break;
+#endif
+       default:
+               break;
+       }
+}
+
+static void netns_bpf_attach_type_need(enum netns_bpf_attach_type type)
+{
+       switch (type) {
+#ifdef CONFIG_INET
+       case NETNS_BPF_SK_LOOKUP:
+               static_branch_inc(&bpf_sk_lookup_enabled);
+               break;
+#endif
+       default:
+               break;
+       }
+}
+
 /* Must be called with netns_bpf_mutex held. */
 static void netns_bpf_run_array_detach(struct net *net,
                                       enum netns_bpf_attach_type type)
@@ -36,12 +62,50 @@ static void netns_bpf_run_array_detach(struct net *net,
        bpf_prog_array_free(run_array);
 }
 
+static int link_index(struct net *net, enum netns_bpf_attach_type type,
+                     struct bpf_netns_link *link)
+{
+       struct bpf_netns_link *pos;
+       int i = 0;
+
+       list_for_each_entry(pos, &net->bpf.links[type], node) {
+               if (pos == link)
+                       return i;
+               i++;
+       }
+       return -ENOENT;
+}
+
+static int link_count(struct net *net, enum netns_bpf_attach_type type)
+{
+       struct list_head *pos;
+       int i = 0;
+
+       list_for_each(pos, &net->bpf.links[type])
+               i++;
+       return i;
+}
+
+static void fill_prog_array(struct net *net, enum netns_bpf_attach_type type,
+                           struct bpf_prog_array *prog_array)
+{
+       struct bpf_netns_link *pos;
+       unsigned int i = 0;
+
+       list_for_each_entry(pos, &net->bpf.links[type], node) {
+               prog_array->items[i].prog = pos->link.prog;
+               i++;
+       }
+}
+
 static void bpf_netns_link_release(struct bpf_link *link)
 {
        struct bpf_netns_link *net_link =
                container_of(link, struct bpf_netns_link, link);
        enum netns_bpf_attach_type type = net_link->netns_type;
+       struct bpf_prog_array *old_array, *new_array;
        struct net *net;
+       int cnt, idx;
 
        mutex_lock(&netns_bpf_mutex);
 
@@ -53,9 +117,30 @@ static void bpf_netns_link_release(struct bpf_link *link)
        if (!net)
                goto out_unlock;
 
-       netns_bpf_run_array_detach(net, type);
+       /* Mark attach point as unused */
+       netns_bpf_attach_type_unneed(type);
+
+       /* Remember link position in case of safe delete */
+       idx = link_index(net, type, net_link);
        list_del(&net_link->node);
 
+       cnt = link_count(net, type);
+       if (!cnt) {
+               netns_bpf_run_array_detach(net, type);
+               goto out_unlock;
+       }
+
+       old_array = rcu_dereference_protected(net->bpf.run_array[type],
+                                             lockdep_is_held(&netns_bpf_mutex));
+       new_array = bpf_prog_array_alloc(cnt, GFP_KERNEL);
+       if (!new_array) {
+               WARN_ON(bpf_prog_array_delete_safe_at(old_array, idx));
+               goto out_unlock;
+       }
+       fill_prog_array(net, type, new_array);
+       rcu_assign_pointer(net->bpf.run_array[type], new_array);
+       bpf_prog_array_free(old_array);
+
 out_unlock:
        mutex_unlock(&netns_bpf_mutex);
 }
@@ -77,7 +162,7 @@ static int bpf_netns_link_update_prog(struct bpf_link *link,
        enum netns_bpf_attach_type type = net_link->netns_type;
        struct bpf_prog_array *run_array;
        struct net *net;
-       int ret = 0;
+       int idx, ret;
 
        if (old_prog && old_prog != link->prog)
                return -EPERM;
@@ -95,7 +180,10 @@ static int bpf_netns_link_update_prog(struct bpf_link *link,
 
        run_array = rcu_dereference_protected(net->bpf.run_array[type],
                                              lockdep_is_held(&netns_bpf_mutex));
-       WRITE_ONCE(run_array->items[0].prog, new_prog);
+       idx = link_index(net, type, net_link);
+       ret = bpf_prog_array_update_at(run_array, idx, new_prog);
+       if (ret)
+               goto out_unlock;
 
        old_prog = xchg(&link->prog, new_prog);
        bpf_prog_put(old_prog);
@@ -309,18 +397,30 @@ int netns_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
        return ret;
 }
 
+static int netns_bpf_max_progs(enum netns_bpf_attach_type type)
+{
+       switch (type) {
+       case NETNS_BPF_FLOW_DISSECTOR:
+               return 1;
+       case NETNS_BPF_SK_LOOKUP:
+               return 64;
+       default:
+               return 0;
+       }
+}
+
 static int netns_bpf_link_attach(struct net *net, struct bpf_link *link,
                                 enum netns_bpf_attach_type type)
 {
        struct bpf_netns_link *net_link =
                container_of(link, struct bpf_netns_link, link);
        struct bpf_prog_array *run_array;
-       int err;
+       int cnt, err;
 
        mutex_lock(&netns_bpf_mutex);
 
-       /* Allow attaching only one prog or link for now */
-       if (!list_empty(&net->bpf.links[type])) {
+       cnt = link_count(net, type);
+       if (cnt >= netns_bpf_max_progs(type)) {
                err = -E2BIG;
                goto out_unlock;
        }
@@ -334,6 +434,9 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link,
        case NETNS_BPF_FLOW_DISSECTOR:
                err = flow_dissector_bpf_prog_attach_check(net, link->prog);
                break;
+       case NETNS_BPF_SK_LOOKUP:
+               err = 0; /* nothing to check */
+               break;
        default:
                err = -EINVAL;
                break;
@@ -341,16 +444,22 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link,
        if (err)
                goto out_unlock;
 
-       run_array = bpf_prog_array_alloc(1, GFP_KERNEL);
+       run_array = bpf_prog_array_alloc(cnt + 1, GFP_KERNEL);
        if (!run_array) {
                err = -ENOMEM;
                goto out_unlock;
        }
-       run_array->items[0].prog = link->prog;
-       rcu_assign_pointer(net->bpf.run_array[type], run_array);
 
        list_add_tail(&net_link->node, &net->bpf.links[type]);
 
+       fill_prog_array(net, type, run_array);
+       run_array = rcu_replace_pointer(net->bpf.run_array[type], run_array,
+                                       lockdep_is_held(&netns_bpf_mutex));
+       bpf_prog_array_free(run_array);
+
+       /* Mark attach point as used */
+       netns_bpf_attach_type_need(type);
+
 out_unlock:
        mutex_unlock(&netns_bpf_mutex);
        return err;
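From userspace, attaching one of the up to 64 SK_LOOKUP links per netns would look roughly like this (assumes libbpf's bpf_program__attach_netns(); the netns path is illustrative and error handling is minimal):

    #include <fcntl.h>
    #include <bpf/libbpf.h>

    static struct bpf_link *demo_attach(struct bpf_program *prog)
    {
            int netns_fd = open("/proc/self/ns/net", O_RDONLY);
            struct bpf_link *link;

            if (netns_fd < 0)
                    return NULL;
            link = bpf_program__attach_netns(prog, netns_fd);
            /* libbpf of this era returns ERR_PTR on failure; check the
             * result with libbpf_get_error(link) before using it */
            return link;
    }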
@@ -426,8 +535,10 @@ static void __net_exit netns_bpf_pernet_pre_exit(struct net *net)
        mutex_lock(&netns_bpf_mutex);
        for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) {
                netns_bpf_run_array_detach(net, type);
-               list_for_each_entry(net_link, &net->bpf.links[type], node)
+               list_for_each_entry(net_link, &net->bpf.links[type], node) {
                        net_link->net = NULL; /* auto-detach link */
+                       netns_bpf_attach_type_unneed(type);
+               }
                if (net->bpf.progs[type])
                        bpf_prog_put(net->bpf.progs[type]);
        }
index 7ea9dfb..d07417d 100644 (file)
@@ -2022,6 +2022,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
                default:
                        return -EINVAL;
                }
+       case BPF_PROG_TYPE_SK_LOOKUP:
+               if (expected_attach_type == BPF_SK_LOOKUP)
+                       return 0;
+               return -EINVAL;
        case BPF_PROG_TYPE_EXT:
                if (expected_attach_type)
                        return -EINVAL;
@@ -2756,6 +2760,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
        case BPF_PROG_TYPE_CGROUP_SOCK:
        case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
        case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+       case BPF_PROG_TYPE_SK_LOOKUP:
                return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
        case BPF_PROG_TYPE_CGROUP_SKB:
                if (!capable(CAP_NET_ADMIN))
@@ -2817,6 +2822,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
                return BPF_PROG_TYPE_CGROUP_SOCKOPT;
        case BPF_TRACE_ITER:
                return BPF_PROG_TYPE_TRACING;
+       case BPF_SK_LOOKUP:
+               return BPF_PROG_TYPE_SK_LOOKUP;
        default:
                return BPF_PROG_TYPE_UNSPEC;
        }
@@ -2953,6 +2960,7 @@ static int bpf_prog_query(const union bpf_attr *attr,
        case BPF_LIRC_MODE2:
                return lirc_prog_query(attr, uattr);
        case BPF_FLOW_DISSECTOR:
+       case BPF_SK_LOOKUP:
                return netns_bpf_prog_query(attr, uattr);
        default:
                return -EINVAL;
@@ -3891,6 +3899,7 @@ static int link_create(union bpf_attr *attr)
                ret = tracing_bpf_link_attach(attr, prog);
                break;
        case BPF_PROG_TYPE_FLOW_DISSECTOR:
+       case BPF_PROG_TYPE_SK_LOOKUP:
                ret = netns_bpf_link_create(attr, prog);
                break;
        default:
index 4dbf2b6..2feecf0 100644 (file)
@@ -7,6 +7,7 @@
 #include <linux/fs.h>
 #include <linux/fdtable.h>
 #include <linux/filter.h>
+#include <linux/btf_ids.h>
 
 struct bpf_iter_seq_task_common {
        struct pid_namespace *ns;
@@ -312,7 +313,11 @@ static const struct seq_operations task_file_seq_ops = {
        .show   = task_file_seq_show,
 };
 
-static const struct bpf_iter_reg task_reg_info = {
+BTF_ID_LIST(btf_task_file_ids)
+BTF_ID(struct, task_struct)
+BTF_ID(struct, file)
+
+static struct bpf_iter_reg task_reg_info = {
        .target                 = "task",
        .seq_ops                = &task_seq_ops,
        .init_seq_private       = init_seq_pidns,
@@ -325,7 +330,7 @@ static const struct bpf_iter_reg task_reg_info = {
        },
 };
 
-static const struct bpf_iter_reg task_file_reg_info = {
+static struct bpf_iter_reg task_file_reg_info = {
        .target                 = "task_file",
        .seq_ops                = &task_file_seq_ops,
        .init_seq_private       = init_seq_pidns,
@@ -344,10 +349,13 @@ static int __init task_iter_init(void)
 {
        int ret;
 
+       task_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0];
        ret = bpf_iter_reg_target(&task_reg_info);
        if (ret)
                return ret;
 
+       task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0];
+       task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1];
        return bpf_iter_reg_target(&task_file_reg_info);
 }
 late_initcall(task_iter_init);
index 3c1efc9..9a6703b 100644 (file)
@@ -3878,10 +3878,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
                        }
                        meta->ref_obj_id = reg->ref_obj_id;
                }
-       } else if (arg_type == ARG_PTR_TO_SOCKET) {
+       } else if (arg_type == ARG_PTR_TO_SOCKET ||
+                  arg_type == ARG_PTR_TO_SOCKET_OR_NULL) {
                expected_type = PTR_TO_SOCKET;
-               if (type != expected_type)
-                       goto err_type;
+               if (!(register_is_null(reg) &&
+                     arg_type == ARG_PTR_TO_SOCKET_OR_NULL)) {
+                       if (type != expected_type)
+                               goto err_type;
+               }
        } else if (arg_type == ARG_PTR_TO_BTF_ID) {
                expected_type = PTR_TO_BTF_ID;
                if (type != expected_type)
@@ -7354,6 +7358,9 @@ static int check_return_code(struct bpf_verifier_env *env)
                        return -ENOTSUPP;
                }
                break;
+       case BPF_PROG_TYPE_SK_LOOKUP:
+               range = tnum_range(SK_DROP, SK_PASS);
+               break;
        case BPF_PROG_TYPE_EXT:
                /* freplace program can return anything as its return value
                 * depends on the to-be-replaced kernel func or bpf program.
index a5fddf9..ca7d635 100644 (file)
@@ -5275,31 +5275,21 @@ static struct bpf_test tests[] = {
        {       /* Mainly checking JIT here. */
                "BPF_MAXINSNS: Ctx heavy transformations",
                { },
-#if defined(CONFIG_BPF_JIT_ALWAYS_ON) && defined(CONFIG_S390)
-               CLASSIC | FLAG_EXPECTED_FAIL,
-#else
                CLASSIC,
-#endif
                { },
                {
                        {  1, SKB_VLAN_PRESENT },
                        { 10, SKB_VLAN_PRESENT }
                },
                .fill_helper = bpf_fill_maxinsns6,
-               .expected_errcode = -ENOTSUPP,
        },
        {       /* Mainly checking JIT here. */
                "BPF_MAXINSNS: Call heavy transformations",
                { },
-#if defined(CONFIG_BPF_JIT_ALWAYS_ON) && defined(CONFIG_S390)
-               CLASSIC | FLAG_NO_DATA | FLAG_EXPECTED_FAIL,
-#else
                CLASSIC | FLAG_NO_DATA,
-#endif
                { },
                { { 1, 0 }, { 10, 0 } },
                .fill_helper = bpf_fill_maxinsns7,
-               .expected_errcode = -ENOTSUPP,
        },
        {       /* Mainly checking JIT here. */
                "BPF_MAXINSNS: Jump heavy test",
@@ -5350,28 +5340,18 @@ static struct bpf_test tests[] = {
        {
                "BPF_MAXINSNS: exec all MSH",
                { },
-#if defined(CONFIG_BPF_JIT_ALWAYS_ON) && defined(CONFIG_S390)
-               CLASSIC | FLAG_EXPECTED_FAIL,
-#else
                CLASSIC,
-#endif
                { 0xfa, 0xfb, 0xfc, 0xfd, },
                { { 4, 0xababab83 } },
                .fill_helper = bpf_fill_maxinsns13,
-               .expected_errcode = -ENOTSUPP,
        },
        {
                "BPF_MAXINSNS: ld_abs+get_processor_id",
                { },
-#if defined(CONFIG_BPF_JIT_ALWAYS_ON) && defined(CONFIG_S390)
-               CLASSIC | FLAG_EXPECTED_FAIL,
-#else
                CLASSIC,
-#endif
                { },
                { { 1, 0xbee } },
                .fill_helper = bpf_fill_ld_abs_get_processor_id,
-               .expected_errcode = -ENOTSUPP,
        },
        /*
         * LD_IND / LD_ABS on fragmented SKBs
index 19f1abc..316349f 100644 (file)
@@ -5449,6 +5449,8 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
                for (i = 0; i < new->aux->used_map_cnt; i++) {
                        if (dev_map_can_have_prog(new->aux->used_maps[i]))
                                return -EINVAL;
+                       if (cpu_map_prog_allowed(new->aux->used_maps[i]))
+                               return -EINVAL;
                }
        }
 
@@ -8880,6 +8882,13 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
                        return -EINVAL;
                }
 
+               if (prog->expected_attach_type == BPF_XDP_CPUMAP) {
+                       NL_SET_ERR_MSG(extack,
+                                      "BPF_XDP_CPUMAP programs can not be attached to a device");
+                       bpf_prog_put(prog);
+                       return -EINVAL;
+               }
+
                /* prog->aux->id may be 0 for orphaned device-bound progs */
                if (prog->aux->id && prog->aux->id == prog_id) {
                        bpf_prog_put(prog);
index 2bf6624..3fa16b8 100644 (file)
@@ -9252,61 +9252,205 @@ const struct bpf_verifier_ops sk_reuseport_verifier_ops = {
 
 const struct bpf_prog_ops sk_reuseport_prog_ops = {
 };
-#endif /* CONFIG_INET */
 
-DEFINE_BPF_DISPATCHER(xdp)
+DEFINE_STATIC_KEY_FALSE(bpf_sk_lookup_enabled);
+EXPORT_SYMBOL(bpf_sk_lookup_enabled);
 
-void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog)
+BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx,
+          struct sock *, sk, u64, flags)
 {
-       bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog);
+       if (unlikely(flags & ~(BPF_SK_LOOKUP_F_REPLACE |
+                              BPF_SK_LOOKUP_F_NO_REUSEPORT)))
+               return -EINVAL;
+       if (unlikely(sk && sk_is_refcounted(sk)))
+               return -ESOCKTNOSUPPORT; /* reject non-RCU freed sockets */
+       if (unlikely(sk && sk->sk_state == TCP_ESTABLISHED))
+               return -ESOCKTNOSUPPORT; /* reject connected sockets */
+
+       /* Check if socket is suitable for packet L3/L4 protocol */
+       if (sk && sk->sk_protocol != ctx->protocol)
+               return -EPROTOTYPE;
+       if (sk && sk->sk_family != ctx->family &&
+           (sk->sk_family == AF_INET || ipv6_only_sock(sk)))
+               return -EAFNOSUPPORT;
+
+       if (ctx->selected_sk && !(flags & BPF_SK_LOOKUP_F_REPLACE))
+               return -EEXIST;
+
+       /* Select socket as lookup result */
+       ctx->selected_sk = sk;
+       ctx->no_reuseport = flags & BPF_SK_LOOKUP_F_NO_REUSEPORT;
+       return 0;
 }
 
-/* Define a list of socket types which can be the argument for
- * skc_to_*_sock() helpers. All these sockets should have
- * sock_common as the first argument in its memory layout.
- */
-#define BTF_SOCK_TYPE_xxx \
-       BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET, "inet_sock")                  \
-       BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_CONN, "inet_connection_sock")  \
-       BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_REQ, "inet_request_sock")      \
-       BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_TW, "inet_timewait_sock")      \
-       BTF_SOCK_TYPE(BTF_SOCK_TYPE_REQ, "request_sock")                \
-       BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCK, "sock")                       \
-       BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCK_COMMON, "sock_common")         \
-       BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP, "tcp_sock")                    \
-       BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_REQ, "tcp_request_sock")        \
-       BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_TW, "tcp_timewait_sock")        \
-       BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP6, "tcp6_sock")                  \
-       BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP, "udp_sock")                    \
-       BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, "udp6_sock")
-
-enum {
-#define BTF_SOCK_TYPE(name, str) name,
-BTF_SOCK_TYPE_xxx
-#undef BTF_SOCK_TYPE
-MAX_BTF_SOCK_TYPE,
+static const struct bpf_func_proto bpf_sk_lookup_assign_proto = {
+       .func           = bpf_sk_lookup_assign,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_PTR_TO_SOCKET_OR_NULL,
+       .arg3_type      = ARG_ANYTHING,
 };
 
-static int btf_sock_ids[MAX_BTF_SOCK_TYPE];
+static const struct bpf_func_proto *
+sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+       switch (func_id) {
+       case BPF_FUNC_perf_event_output:
+               return &bpf_event_output_data_proto;
+       case BPF_FUNC_sk_assign:
+               return &bpf_sk_lookup_assign_proto;
+       case BPF_FUNC_sk_release:
+               return &bpf_sk_release_proto;
+       default:
+               return bpf_base_func_proto(func_id);
+       }
+}
 
-#ifdef CONFIG_BPF_SYSCALL
-static const char *bpf_sock_types[] = {
-#define BTF_SOCK_TYPE(name, str) str,
-BTF_SOCK_TYPE_xxx
-#undef BTF_SOCK_TYPE
-};
+static bool sk_lookup_is_valid_access(int off, int size,
+                                     enum bpf_access_type type,
+                                     const struct bpf_prog *prog,
+                                     struct bpf_insn_access_aux *info)
+{
+       if (off < 0 || off >= sizeof(struct bpf_sk_lookup))
+               return false;
+       if (off % size != 0)
+               return false;
+       if (type != BPF_READ)
+               return false;
+
+       switch (off) {
+       case offsetof(struct bpf_sk_lookup, sk):
+               info->reg_type = PTR_TO_SOCKET_OR_NULL;
+               return size == sizeof(__u64);
 
-void init_btf_sock_ids(struct btf *btf)
+       case bpf_ctx_range(struct bpf_sk_lookup, family):
+       case bpf_ctx_range(struct bpf_sk_lookup, protocol):
+       case bpf_ctx_range(struct bpf_sk_lookup, remote_ip4):
+       case bpf_ctx_range(struct bpf_sk_lookup, local_ip4):
+       case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]):
+       case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]):
+       case bpf_ctx_range(struct bpf_sk_lookup, remote_port):
+       case bpf_ctx_range(struct bpf_sk_lookup, local_port):
+               bpf_ctx_record_field_size(info, sizeof(__u32));
+               return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32));
+
+       default:
+               return false;
+       }
+}
+
+static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type,
+                                       const struct bpf_insn *si,
+                                       struct bpf_insn *insn_buf,
+                                       struct bpf_prog *prog,
+                                       u32 *target_size)
 {
-       int i, btf_id;
+       struct bpf_insn *insn = insn_buf;
+
+       switch (si->off) {
+       case offsetof(struct bpf_sk_lookup, sk):
+               *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
+                                     offsetof(struct bpf_sk_lookup_kern, selected_sk));
+               break;
 
-       for (i = 0; i < MAX_BTF_SOCK_TYPE; i++) {
-               btf_id = btf_find_by_name_kind(btf, bpf_sock_types[i],
-                                              BTF_KIND_STRUCT);
-               if (btf_id > 0)
-                       btf_sock_ids[i] = btf_id;
+       case offsetof(struct bpf_sk_lookup, family):
+               *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+                                     bpf_target_off(struct bpf_sk_lookup_kern,
+                                                    family, 2, target_size));
+               break;
+
+       case offsetof(struct bpf_sk_lookup, protocol):
+               *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+                                     bpf_target_off(struct bpf_sk_lookup_kern,
+                                                    protocol, 2, target_size));
+               break;
+
+       case offsetof(struct bpf_sk_lookup, remote_ip4):
+               *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+                                     bpf_target_off(struct bpf_sk_lookup_kern,
+                                                    v4.saddr, 4, target_size));
+               break;
+
+       case offsetof(struct bpf_sk_lookup, local_ip4):
+               *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+                                     bpf_target_off(struct bpf_sk_lookup_kern,
+                                                    v4.daddr, 4, target_size));
+               break;
+
+       case bpf_ctx_range_till(struct bpf_sk_lookup,
+                               remote_ip6[0], remote_ip6[3]): {
+#if IS_ENABLED(CONFIG_IPV6)
+               int off = si->off;
+
+               off -= offsetof(struct bpf_sk_lookup, remote_ip6[0]);
+               off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size);
+               *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
+                                     offsetof(struct bpf_sk_lookup_kern, v6.saddr));
+               *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+               *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off);
+#else
+               *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+               break;
+       }
+       case bpf_ctx_range_till(struct bpf_sk_lookup,
+                               local_ip6[0], local_ip6[3]): {
+#if IS_ENABLED(CONFIG_IPV6)
+               int off = si->off;
+
+               off -= offsetof(struct bpf_sk_lookup, local_ip6[0]);
+               off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size);
+               *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
+                                     offsetof(struct bpf_sk_lookup_kern, v6.daddr));
+               *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+               *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off);
+#else
+               *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+               break;
        }
+       case offsetof(struct bpf_sk_lookup, remote_port):
+               *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+                                     bpf_target_off(struct bpf_sk_lookup_kern,
+                                                    sport, 2, target_size));
+               break;
+
+       case offsetof(struct bpf_sk_lookup, local_port):
+               *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+                                     bpf_target_off(struct bpf_sk_lookup_kern,
+                                                    dport, 2, target_size));
+               break;
+       }
+
+       return insn - insn_buf;
 }
+
+const struct bpf_prog_ops sk_lookup_prog_ops = {
+};
+
+const struct bpf_verifier_ops sk_lookup_verifier_ops = {
+       .get_func_proto         = sk_lookup_func_proto,
+       .is_valid_access        = sk_lookup_is_valid_access,
+       .convert_ctx_access     = sk_lookup_convert_ctx_access,
+};
+
+#endif /* CONFIG_INET */
+
+DEFINE_BPF_DISPATCHER(xdp)
+
+void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog)
+{
+       bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog);
+}
+
+#ifdef CONFIG_DEBUG_INFO_BTF
+BTF_ID_LIST_GLOBAL(btf_sock_ids)
+#define BTF_SOCK_TYPE(name, type) BTF_ID(struct, type)
+BTF_SOCK_TYPE_xxx
+#undef BTF_SOCK_TYPE
+#else
+u32 btf_sock_ids[MAX_BTF_SOCK_TYPE];
 #endif
 
 static bool check_arg_btf_id(u32 btf_id, u32 arg)
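Taken together, the filter.c additions define the sk_lookup program type end to end: the bpf_sk_assign() helper records a verdict in bpf_sk_lookup_kern, is_valid_access() restricts programs to read-only, properly sized context loads, and convert_ctx_access() rewrites those loads into fetches from the kernel-side structure. A minimal sketch of a consumer program, assuming the section-name convention and map shape used by the selftests in this series (port number and map name illustrative):

    #include <linux/bpf.h>
    #include <linux/in.h>
    #include <bpf/bpf_helpers.h>

    struct {
            __uint(type, BPF_MAP_TYPE_SOCKMAP);
            __uint(max_entries, 1);
            __type(key, __u32);
            __type(value, __u64);
    } dest_sock SEC(".maps");

    SEC("sk_lookup/steer_tcp")
    int steer_tcp(struct bpf_sk_lookup *ctx)
    {
            const __u32 key = 0;
            struct bpf_sock *sk;
            int err;

            if (ctx->protocol != IPPROTO_TCP || ctx->local_port != 7007)
                    return SK_PASS;

            sk = bpf_map_lookup_elem(&dest_sock, &key);
            if (!sk)
                    return SK_PASS;

            err = bpf_sk_assign(ctx, sk, 0);
            bpf_sk_release(sk);
            return err ? SK_DROP : SK_PASS;
    }

    char _license[] SEC("license") = "GPL";

Failure modes map to the errno values in bpf_sk_lookup_assign() above; for example, assigning an established socket fails with -ESOCKTNOSUPPORT, and the sketch simply drops in that case.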
index 2bbaaf0..4eb4cd8 100644 (file)
@@ -246,6 +246,21 @@ static inline int compute_score(struct sock *sk, struct net *net,
        return score;
 }
 
+static inline struct sock *lookup_reuseport(struct net *net, struct sock *sk,
+                                           struct sk_buff *skb, int doff,
+                                           __be32 saddr, __be16 sport,
+                                           __be32 daddr, unsigned short hnum)
+{
+       struct sock *reuse_sk = NULL;
+       u32 phash;
+
+       if (sk->sk_reuseport) {
+               phash = inet_ehashfn(net, daddr, hnum, saddr, sport);
+               reuse_sk = reuseport_select_sock(sk, phash, skb, doff);
+       }
+       return reuse_sk;
+}
+
 /*
  * Here are some nice properties to exploit here. The BSD API
  * does not allow a listening sock to specify the remote port nor the
@@ -265,21 +280,17 @@ static struct sock *inet_lhash2_lookup(struct net *net,
        struct inet_connection_sock *icsk;
        struct sock *sk, *result = NULL;
        int score, hiscore = 0;
-       u32 phash = 0;
 
        inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
                sk = (struct sock *)icsk;
                score = compute_score(sk, net, hnum, daddr,
                                      dif, sdif, exact_dif);
                if (score > hiscore) {
-                       if (sk->sk_reuseport) {
-                               phash = inet_ehashfn(net, daddr, hnum,
-                                                    saddr, sport);
-                               result = reuseport_select_sock(sk, phash,
-                                                              skb, doff);
-                               if (result)
-                                       return result;
-                       }
+                       result = lookup_reuseport(net, sk, skb, doff,
+                                                 saddr, sport, daddr, hnum);
+                       if (result)
+                               return result;
+
                        result = sk;
                        hiscore = score;
                }
@@ -288,6 +299,29 @@ static struct sock *inet_lhash2_lookup(struct net *net,
        return result;
 }
 
+static inline struct sock *inet_lookup_run_bpf(struct net *net,
+                                              struct inet_hashinfo *hashinfo,
+                                              struct sk_buff *skb, int doff,
+                                              __be32 saddr, __be16 sport,
+                                              __be32 daddr, u16 hnum)
+{
+       struct sock *sk, *reuse_sk;
+       bool no_reuseport;
+
+       if (hashinfo != &tcp_hashinfo)
+               return NULL; /* only TCP is supported */
+
+       no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_TCP,
+                                           saddr, sport, daddr, hnum, &sk);
+       if (no_reuseport || IS_ERR_OR_NULL(sk))
+               return sk;
+
+       reuse_sk = lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum);
+       if (reuse_sk)
+               sk = reuse_sk;
+       return sk;
+}
+
 struct sock *__inet_lookup_listener(struct net *net,
                                    struct inet_hashinfo *hashinfo,
                                    struct sk_buff *skb, int doff,
@@ -299,6 +333,14 @@ struct sock *__inet_lookup_listener(struct net *net,
        struct sock *result = NULL;
        unsigned int hash2;
 
+       /* Lookup redirect from BPF */
+       if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
+               result = inet_lookup_run_bpf(net, hashinfo, skb, doff,
+                                            saddr, sport, daddr, hnum);
+               if (result)
+                       goto done;
+       }
+
        hash2 = ipv4_portaddr_hash(net, daddr, hnum);
        ilb2 = inet_lhash2_bucket(hashinfo, hash2);
 
index cd81b6e..daa39d3 100644 (file)
@@ -76,6 +76,7 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/inetdevice.h>
+#include <linux/btf_ids.h>
 
 #include <crypto/hash.h>
 #include <linux/scatterlist.h>
@@ -2946,7 +2947,7 @@ static void bpf_iter_fini_tcp(void *priv_data)
        bpf_iter_fini_seq_net(priv_data);
 }
 
-static const struct bpf_iter_reg tcp_reg_info = {
+static struct bpf_iter_reg tcp_reg_info = {
        .target                 = "tcp",
        .seq_ops                = &bpf_iter_tcp_seq_ops,
        .init_seq_private       = bpf_iter_init_tcp,
@@ -2961,6 +2962,7 @@ static const struct bpf_iter_reg tcp_reg_info = {
 
 static void __init bpf_iter_register(void)
 {
+       tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
        if (bpf_iter_reg_target(&tcp_reg_info))
                pr_warn("Warning: could not register bpf iterator tcp\n");
 }
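tcp_reg_info loses its const qualifier so that bpf_iter_register() can patch in the BTF id of struct sock_common, now resolved at build time via btf_sock_ids rather than by a runtime name lookup. On the BPF side an iterator program then sees a properly typed context argument; a rough sketch, assuming CO-RE types from a generated vmlinux.h as the selftests do:

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    SEC("iter/tcp")
    int dump_tcp(struct bpf_iter__tcp *ctx)
    {
            struct sock_common *sk_common = ctx->sk_common;
            struct seq_file *seq = ctx->meta->seq;
            static const char fmt[] = "family: %d\n";
            __u64 family;

            if (!sk_common)
                    return 0;
            family = sk_common->skc_family;
            bpf_seq_printf(seq, fmt, sizeof(fmt), &family, sizeof(family));
            return 0;
    }

    char _license[] SEC("license") = "GPL";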
index d4be447..bb95cdd 100644 (file)
 #include <net/xfrm.h>
 #include <trace/events/udp.h>
 #include <linux/static_key.h>
+#include <linux/btf_ids.h>
 #include <trace/events/skb.h>
 #include <net/busy_poll.h>
 #include "udp_impl.h"
@@ -408,6 +409,25 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
                              udp_ehash_secret + net_hash_mix(net));
 }
 
+static inline struct sock *lookup_reuseport(struct net *net, struct sock *sk,
+                                           struct sk_buff *skb,
+                                           __be32 saddr, __be16 sport,
+                                           __be32 daddr, unsigned short hnum)
+{
+       struct sock *reuse_sk = NULL;
+       u32 hash;
+
+       if (sk->sk_reuseport && sk->sk_state != TCP_ESTABLISHED) {
+               hash = udp_ehashfn(net, daddr, hnum, saddr, sport);
+               reuse_sk = reuseport_select_sock(sk, hash, skb,
+                                                sizeof(struct udphdr));
+               /* Fall back to scoring if group has connections */
+               if (reuseport_has_conns(sk, false))
+                       return NULL;
+       }
+       return reuse_sk;
+}
+
 /* called with rcu_read_lock() */
 static struct sock *udp4_lib_lookup2(struct net *net,
                                     __be32 saddr, __be16 sport,
@@ -418,7 +438,6 @@ static struct sock *udp4_lib_lookup2(struct net *net,
 {
        struct sock *sk, *result;
        int score, badness;
-       u32 hash = 0;
 
        result = NULL;
        badness = 0;
@@ -426,15 +445,11 @@ static struct sock *udp4_lib_lookup2(struct net *net,
                score = compute_score(sk, net, saddr, sport,
                                      daddr, hnum, dif, sdif);
                if (score > badness) {
-                       if (sk->sk_reuseport &&
-                           sk->sk_state != TCP_ESTABLISHED) {
-                               hash = udp_ehashfn(net, daddr, hnum,
-                                                  saddr, sport);
-                               result = reuseport_select_sock(sk, hash, skb,
-                                                       sizeof(struct udphdr));
-                               if (result && !reuseport_has_conns(sk, false))
-                                       return result;
-                       }
+                       result = lookup_reuseport(net, sk, skb,
+                                                 saddr, sport, daddr, hnum);
+                       if (result)
+                               return result;
+
                        badness = score;
                        result = sk;
                }
@@ -442,6 +457,29 @@ static struct sock *udp4_lib_lookup2(struct net *net,
        return result;
 }
 
+static inline struct sock *udp4_lookup_run_bpf(struct net *net,
+                                              struct udp_table *udptable,
+                                              struct sk_buff *skb,
+                                              __be32 saddr, __be16 sport,
+                                              __be32 daddr, u16 hnum)
+{
+       struct sock *sk, *reuse_sk;
+       bool no_reuseport;
+
+       if (udptable != &udp_table)
+               return NULL; /* only UDP is supported */
+
+       no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_UDP,
+                                           saddr, sport, daddr, hnum, &sk);
+       if (no_reuseport || IS_ERR_OR_NULL(sk))
+               return sk;
+
+       reuse_sk = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum);
+       if (reuse_sk)
+               sk = reuse_sk;
+       return sk;
+}
+
 /* UDP is nearly always wildcards out the wazoo, it makes no sense to try
  * harder than this. -DaveM
  */
@@ -449,27 +487,45 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
                __be16 sport, __be32 daddr, __be16 dport, int dif,
                int sdif, struct udp_table *udptable, struct sk_buff *skb)
 {
-       struct sock *result;
        unsigned short hnum = ntohs(dport);
        unsigned int hash2, slot2;
        struct udp_hslot *hslot2;
+       struct sock *result, *sk;
 
        hash2 = ipv4_portaddr_hash(net, daddr, hnum);
        slot2 = hash2 & udptable->mask;
        hslot2 = &udptable->hash2[slot2];
 
+       /* Lookup connected or non-wildcard socket */
        result = udp4_lib_lookup2(net, saddr, sport,
                                  daddr, hnum, dif, sdif,
                                  hslot2, skb);
-       if (!result) {
-               hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
-               slot2 = hash2 & udptable->mask;
-               hslot2 = &udptable->hash2[slot2];
-
-               result = udp4_lib_lookup2(net, saddr, sport,
-                                         htonl(INADDR_ANY), hnum, dif, sdif,
-                                         hslot2, skb);
+       if (!IS_ERR_OR_NULL(result) && result->sk_state == TCP_ESTABLISHED)
+               goto done;
+
+       /* Lookup redirect from BPF */
+       if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
+               sk = udp4_lookup_run_bpf(net, udptable, skb,
+                                        saddr, sport, daddr, hnum);
+               if (sk) {
+                       result = sk;
+                       goto done;
+               }
        }
+
+       /* Got non-wildcard socket or error on first lookup */
+       if (result)
+               goto done;
+
+       /* Lookup wildcard sockets */
+       hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
+       slot2 = hash2 & udptable->mask;
+       hslot2 = &udptable->hash2[slot2];
+
+       result = udp4_lib_lookup2(net, saddr, sport,
+                                 htonl(INADDR_ANY), hnum, dif, sdif,
+                                 hslot2, skb);
+done:
        if (IS_ERR(result))
                return NULL;
        return result;
@@ -3153,7 +3209,7 @@ static void bpf_iter_fini_udp(void *priv_data)
        bpf_iter_fini_seq_net(priv_data);
 }
 
-static const struct bpf_iter_reg udp_reg_info = {
+static struct bpf_iter_reg udp_reg_info = {
        .target                 = "udp",
        .seq_ops                = &bpf_iter_udp_seq_ops,
        .init_seq_private       = bpf_iter_init_udp,
@@ -3168,6 +3224,7 @@ static const struct bpf_iter_reg udp_reg_info = {
 
 static void __init bpf_iter_register(void)
 {
+       udp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UDP];
        if (bpf_iter_reg_target(&udp_reg_info))
                pr_warn("Warning: could not register bpf iterator udp\n");
 }
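The rework of __udp4_lib_lookup() makes the lookup stages explicit so the BPF verdict lands at the right precedence. In sketch form:

    /* Resulting precedence, per the hunk above:
     *
     *   1. connected socket matching the full 4-tuple (always wins)
     *   2. BPF sk_lookup selection (only if a program is attached)
     *   3. unconnected socket bound to the packet's local address
     *   4. unconnected socket bound to the wildcard address
     *
     * A reuseport group selected in step 2 still goes through
     * reuseport_select_sock(), so SO_REUSEPORT load balancing is kept
     * unless the program passed BPF_SK_LOOKUP_F_NO_REUSEPORT.
     */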
index fbe9d42..2d3add9 100644 (file)
@@ -21,6 +21,8 @@
 #include <net/ip.h>
 #include <net/sock_reuseport.h>
 
+extern struct inet_hashinfo tcp_hashinfo;
+
 u32 inet6_ehashfn(const struct net *net,
                  const struct in6_addr *laddr, const u16 lport,
                  const struct in6_addr *faddr, const __be16 fport)
@@ -111,6 +113,23 @@ static inline int compute_score(struct sock *sk, struct net *net,
        return score;
 }
 
+static inline struct sock *lookup_reuseport(struct net *net, struct sock *sk,
+                                           struct sk_buff *skb, int doff,
+                                           const struct in6_addr *saddr,
+                                           __be16 sport,
+                                           const struct in6_addr *daddr,
+                                           unsigned short hnum)
+{
+       struct sock *reuse_sk = NULL;
+       u32 phash;
+
+       if (sk->sk_reuseport) {
+               phash = inet6_ehashfn(net, daddr, hnum, saddr, sport);
+               reuse_sk = reuseport_select_sock(sk, phash, skb, doff);
+       }
+       return reuse_sk;
+}
+
 /* called with rcu_read_lock() */
 static struct sock *inet6_lhash2_lookup(struct net *net,
                struct inet_listen_hashbucket *ilb2,
@@ -123,21 +142,17 @@ static struct sock *inet6_lhash2_lookup(struct net *net,
        struct inet_connection_sock *icsk;
        struct sock *sk, *result = NULL;
        int score, hiscore = 0;
-       u32 phash = 0;
 
        inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
                sk = (struct sock *)icsk;
                score = compute_score(sk, net, hnum, daddr, dif, sdif,
                                      exact_dif);
                if (score > hiscore) {
-                       if (sk->sk_reuseport) {
-                               phash = inet6_ehashfn(net, daddr, hnum,
-                                                     saddr, sport);
-                               result = reuseport_select_sock(sk, phash,
-                                                              skb, doff);
-                               if (result)
-                                       return result;
-                       }
+                       result = lookup_reuseport(net, sk, skb, doff,
+                                                 saddr, sport, daddr, hnum);
+                       if (result)
+                               return result;
+
                        result = sk;
                        hiscore = score;
                }
@@ -146,6 +161,31 @@ static struct sock *inet6_lhash2_lookup(struct net *net,
        return result;
 }
 
+static inline struct sock *inet6_lookup_run_bpf(struct net *net,
+                                               struct inet_hashinfo *hashinfo,
+                                               struct sk_buff *skb, int doff,
+                                               const struct in6_addr *saddr,
+                                               const __be16 sport,
+                                               const struct in6_addr *daddr,
+                                               const u16 hnum)
+{
+       struct sock *sk, *reuse_sk;
+       bool no_reuseport;
+
+       if (hashinfo != &tcp_hashinfo)
+               return NULL; /* only TCP is supported */
+
+       no_reuseport = bpf_sk_lookup_run_v6(net, IPPROTO_TCP,
+                                           saddr, sport, daddr, hnum, &sk);
+       if (no_reuseport || IS_ERR_OR_NULL(sk))
+               return sk;
+
+       reuse_sk = lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum);
+       if (reuse_sk)
+               sk = reuse_sk;
+       return sk;
+}
+
 struct sock *inet6_lookup_listener(struct net *net,
                struct inet_hashinfo *hashinfo,
                struct sk_buff *skb, int doff,
@@ -157,6 +197,14 @@ struct sock *inet6_lookup_listener(struct net *net,
        struct sock *result = NULL;
        unsigned int hash2;
 
+       /* Lookup redirect from BPF */
+       if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
+               result = inet6_lookup_run_bpf(net, hashinfo, skb, doff,
+                                             saddr, sport, daddr, hnum);
+               if (result)
+                       goto done;
+       }
+
        hash2 = ipv6_portaddr_hash(net, daddr, hnum);
        ilb2 = inet_lhash2_bucket(hashinfo, hash2);
 
index 427b81c..33f5efb 100644 (file)
@@ -61,6 +61,7 @@
 #include <net/l3mdev.h>
 #include <net/ip.h>
 #include <linux/uaccess.h>
+#include <linux/btf_ids.h>
 
 #ifdef CONFIG_SYSCTL
 #include <linux/sysctl.h>
@@ -6423,7 +6424,10 @@ void __init ip6_route_init_special_entries(void)
 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
 DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt)
 
-static const struct bpf_iter_reg ipv6_route_reg_info = {
+BTF_ID_LIST(btf_fib6_info_id)
+BTF_ID(struct, fib6_info)
+
+static struct bpf_iter_reg ipv6_route_reg_info = {
        .target                 = "ipv6_route",
        .seq_ops                = &ipv6_route_seq_ops,
        .init_seq_private       = bpf_iter_init_seq_net,
@@ -6438,6 +6442,7 @@ static const struct bpf_iter_reg ipv6_route_reg_info = {
 
 static int __init bpf_iter_register(void)
 {
+       ipv6_route_reg_info.ctx_arg_info[0].btf_id = *btf_fib6_info_id;
        return bpf_iter_reg_target(&ipv6_route_reg_info);
 }
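BTF_ID_LIST()/BTF_ID() come from the new <linux/btf_ids.h>. With CONFIG_DEBUG_INFO_BTF the macros emit placeholders into a dedicated ELF section that the resolve_btfids build step patches with the numeric BTF type ids, which is what lets bpf_iter_register() above avoid any runtime string lookup. Without BTF they degrade to a plain zeroed array; roughly (a sketch, see the header for the exact definitions):

    #ifndef CONFIG_DEBUG_INFO_BTF
    #define BTF_ID_LIST(name) static u32 name[5];
    #define BTF_ID(prefix, name)
    #endif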
 
index 5aff085..7c1143f 100644 (file)
@@ -141,6 +141,27 @@ static int compute_score(struct sock *sk, struct net *net,
        return score;
 }
 
+static inline struct sock *lookup_reuseport(struct net *net, struct sock *sk,
+                                           struct sk_buff *skb,
+                                           const struct in6_addr *saddr,
+                                           __be16 sport,
+                                           const struct in6_addr *daddr,
+                                           unsigned int hnum)
+{
+       struct sock *reuse_sk = NULL;
+       u32 hash;
+
+       if (sk->sk_reuseport && sk->sk_state != TCP_ESTABLISHED) {
+               hash = udp6_ehashfn(net, daddr, hnum, saddr, sport);
+               reuse_sk = reuseport_select_sock(sk, hash, skb,
+                                                sizeof(struct udphdr));
+               /* Fall back to scoring if group has connections */
+               if (reuseport_has_conns(sk, false))
+                       return NULL;
+       }
+       return reuse_sk;
+}
+
 /* called with rcu_read_lock() */
 static struct sock *udp6_lib_lookup2(struct net *net,
                const struct in6_addr *saddr, __be16 sport,
@@ -150,7 +171,6 @@ static struct sock *udp6_lib_lookup2(struct net *net,
 {
        struct sock *sk, *result;
        int score, badness;
-       u32 hash = 0;
 
        result = NULL;
        badness = -1;
@@ -158,16 +178,11 @@ static struct sock *udp6_lib_lookup2(struct net *net,
                score = compute_score(sk, net, saddr, sport,
                                      daddr, hnum, dif, sdif);
                if (score > badness) {
-                       if (sk->sk_reuseport &&
-                           sk->sk_state != TCP_ESTABLISHED) {
-                               hash = udp6_ehashfn(net, daddr, hnum,
-                                                   saddr, sport);
-
-                               result = reuseport_select_sock(sk, hash, skb,
-                                                       sizeof(struct udphdr));
-                               if (result && !reuseport_has_conns(sk, false))
-                                       return result;
-                       }
+                       result = lookup_reuseport(net, sk, skb,
+                                                 saddr, sport, daddr, hnum);
+                       if (result)
+                               return result;
+
                        result = sk;
                        badness = score;
                }
@@ -175,6 +190,31 @@ static struct sock *udp6_lib_lookup2(struct net *net,
        return result;
 }
 
+static inline struct sock *udp6_lookup_run_bpf(struct net *net,
+                                              struct udp_table *udptable,
+                                              struct sk_buff *skb,
+                                              const struct in6_addr *saddr,
+                                              __be16 sport,
+                                              const struct in6_addr *daddr,
+                                              u16 hnum)
+{
+       struct sock *sk, *reuse_sk;
+       bool no_reuseport;
+
+       if (udptable != &udp_table)
+               return NULL; /* only UDP is supported */
+
+       no_reuseport = bpf_sk_lookup_run_v6(net, IPPROTO_UDP,
+                                           saddr, sport, daddr, hnum, &sk);
+       if (no_reuseport || IS_ERR_OR_NULL(sk))
+               return sk;
+
+       reuse_sk = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum);
+       if (reuse_sk)
+               sk = reuse_sk;
+       return sk;
+}
+
 /* rcu_read_lock() must be held */
 struct sock *__udp6_lib_lookup(struct net *net,
                               const struct in6_addr *saddr, __be16 sport,
@@ -185,25 +225,42 @@ struct sock *__udp6_lib_lookup(struct net *net,
        unsigned short hnum = ntohs(dport);
        unsigned int hash2, slot2;
        struct udp_hslot *hslot2;
-       struct sock *result;
+       struct sock *result, *sk;
 
        hash2 = ipv6_portaddr_hash(net, daddr, hnum);
        slot2 = hash2 & udptable->mask;
        hslot2 = &udptable->hash2[slot2];
 
+       /* Lookup connected or non-wildcard sockets */
        result = udp6_lib_lookup2(net, saddr, sport,
                                  daddr, hnum, dif, sdif,
                                  hslot2, skb);
-       if (!result) {
-               hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum);
-               slot2 = hash2 & udptable->mask;
+       if (!IS_ERR_OR_NULL(result) && result->sk_state == TCP_ESTABLISHED)
+               goto done;
+
+       /* Lookup redirect from BPF */
+       if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
+               sk = udp6_lookup_run_bpf(net, udptable, skb,
+                                        saddr, sport, daddr, hnum);
+               if (sk) {
+                       result = sk;
+                       goto done;
+               }
+       }
 
-               hslot2 = &udptable->hash2[slot2];
+       /* Got non-wildcard socket or error on first lookup */
+       if (result)
+               goto done;
 
-               result = udp6_lib_lookup2(net, saddr, sport,
-                                         &in6addr_any, hnum, dif, sdif,
-                                         hslot2, skb);
-       }
+       /* Lookup wildcard sockets */
+       hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum);
+       slot2 = hash2 & udptable->mask;
+       hslot2 = &udptable->hash2[slot2];
+
+       result = udp6_lib_lookup2(net, saddr, sport,
+                                 &in6addr_any, hnum, dif, sdif,
+                                 hslot2, skb);
+done:
        if (IS_ERR(result))
                return NULL;
        return result;
index 4f2c3b1..3cd58f0 100644 (file)
@@ -60,6 +60,7 @@
 #include <linux/genetlink.h>
 #include <linux/net_namespace.h>
 #include <linux/nospec.h>
+#include <linux/btf_ids.h>
 
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
@@ -2803,7 +2804,10 @@ static const struct rhashtable_params netlink_rhashtable_params = {
 };
 
 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
-static const struct bpf_iter_reg netlink_reg_info = {
+BTF_ID_LIST(btf_netlink_sock_id)
+BTF_ID(struct, netlink_sock)
+
+static struct bpf_iter_reg netlink_reg_info = {
        .target                 = "netlink",
        .seq_ops                = &netlink_seq_ops,
        .init_seq_private       = bpf_iter_init_seq_net,
@@ -2818,6 +2822,7 @@ static const struct bpf_iter_reg netlink_reg_info = {
 
 static int __init bpf_iter_register(void)
 {
+       netlink_reg_info.ctx_arg_info[0].btf_id = *btf_netlink_sock_id;
        return bpf_iter_reg_target(&netlink_reg_info);
 }
 #endif
index d459f73..e74ee1c 100644 (file)
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 
-#define _(P) ({typeof(P) val; bpf_probe_read(&val, sizeof(val), &P); val;})
+#define _(P)                                                                   \
+       ({                                                                     \
+               typeof(P) val;                                                 \
+               bpf_probe_read_kernel(&val, sizeof(val), &(P));                \
+               val;                                                           \
+       })
 
 #define MINBLOCK_US    1
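The samples move from bpf_probe_read(), which is ambiguous about the address space being read on architectures where kernel and user ranges overlap, to the explicit bpf_probe_read_kernel(); the _() macro also gains parentheses around P so that expressions expand safely. A hedged usage sketch (attach point illustrative):

    SEC("kprobe/do_exit")
    int sample_usage(struct pt_regs *ctx)
    {
            struct task_struct *task = (void *)bpf_get_current_task();
            int pid = _(task->pid);  /* expands to bpf_probe_read_kernel() */
            char fmt[] = "exiting pid=%d\n";

            bpf_trace_printk(fmt, sizeof(fmt), pid);
            return 0;
    }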
 
index 8b811c2..f6d593e 100644 (file)
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 
-#define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;})
+#define _(P)                                                                   \
+       ({                                                                     \
+               typeof(P) val = 0;                                             \
+               bpf_probe_read_kernel(&val, sizeof(val), &(P));                \
+               val;                                                           \
+       })
 
 SEC("kprobe/__set_task_comm")
 int prog(struct pt_regs *ctx)
@@ -25,8 +30,9 @@ int prog(struct pt_regs *ctx)
        tsk = (void *)PT_REGS_PARM1(ctx);
 
        pid = _(tsk->pid);
-       bpf_probe_read(oldcomm, sizeof(oldcomm), &tsk->comm);
-       bpf_probe_read(newcomm, sizeof(newcomm), (void *)PT_REGS_PARM2(ctx));
+       bpf_probe_read_kernel(oldcomm, sizeof(oldcomm), &tsk->comm);
+       bpf_probe_read_kernel(newcomm, sizeof(newcomm),
+                             (void *)PT_REGS_PARM2(ctx));
        signal = _(tsk->signal);
        oom_score_adj = _(signal->oom_score_adj);
        return 0;
index 8e2610e..3f4599c 100644 (file)
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 
-#define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;})
+#define _(P)                                                                   \
+       ({                                                                     \
+               typeof(P) val = 0;                                             \
+               bpf_probe_read_kernel(&val, sizeof(val), &(P));                \
+               val;                                                           \
+       })
 
 /* kprobe is NOT a stable ABI
  * kernel functions can be removed, renamed or completely change semantics.
@@ -34,7 +39,7 @@ int bpf_prog1(struct pt_regs *ctx)
        dev = _(skb->dev);
        len = _(skb->len);
 
-       bpf_probe_read(devname, sizeof(devname), dev->name);
+       bpf_probe_read_kernel(devname, sizeof(devname), dev->name);
 
        if (devname[0] == 'l' && devname[1] == 'o') {
                char fmt[] = "skb %p len %d\n";
index 32b49e8..64a1f75 100644 (file)
@@ -47,7 +47,7 @@ PROG(SYS__NR_write)(struct pt_regs *ctx)
 {
        struct seccomp_data sd;
 
-       bpf_probe_read(&sd, sizeof(sd), (void *)PT_REGS_PARM2(ctx));
+       bpf_probe_read_kernel(&sd, sizeof(sd), (void *)PT_REGS_PARM2(ctx));
        if (sd.args[2] == 512) {
                char fmt[] = "write(fd=%d, buf=%p, size=%d)\n";
                bpf_trace_printk(fmt, sizeof(fmt),
@@ -60,7 +60,7 @@ PROG(SYS__NR_read)(struct pt_regs *ctx)
 {
        struct seccomp_data sd;
 
-       bpf_probe_read(&sd, sizeof(sd), (void *)PT_REGS_PARM2(ctx));
+       bpf_probe_read_kernel(&sd, sizeof(sd), (void *)PT_REGS_PARM2(ctx));
        if (sd.args[2] > 128 && sd.args[2] <= 1024) {
                char fmt[] = "read(fd=%d, buf=%p, size=%d)\n";
                bpf_trace_printk(fmt, sizeof(fmt),
index 2baf8db..8255025 100644 (file)
@@ -21,7 +21,7 @@
 struct {
        __uint(type, BPF_MAP_TYPE_CPUMAP);
        __uint(key_size, sizeof(u32));
-       __uint(value_size, sizeof(u32));
+       __uint(value_size, sizeof(struct bpf_cpumap_val));
        __uint(max_entries, MAX_CPUS);
 } cpu_map SEC(".maps");
 
@@ -30,6 +30,9 @@ struct datarec {
        __u64 processed;
        __u64 dropped;
        __u64 issue;
+       __u64 xdp_pass;
+       __u64 xdp_drop;
+       __u64 xdp_redirect;
 };
 
 /* Count RX packets, as XDP bpf_prog doesn't get direct TX-success
@@ -692,13 +695,16 @@ int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx)
  * Code in:         kernel/include/trace/events/xdp.h
  */
 struct cpumap_kthread_ctx {
-       u64 __pad;              // First 8 bytes are not accessible by bpf code
-       int map_id;             //      offset:8;  size:4; signed:1;
-       u32 act;                //      offset:12; size:4; signed:0;
-       int cpu;                //      offset:16; size:4; signed:1;
-       unsigned int drops;     //      offset:20; size:4; signed:0;
-       unsigned int processed; //      offset:24; size:4; signed:0;
-       int sched;              //      offset:28; size:4; signed:1;
+       u64 __pad;                      // First 8 bytes are not accessible
+       int map_id;                     //      offset:8;  size:4; signed:1;
+       u32 act;                        //      offset:12; size:4; signed:0;
+       int cpu;                        //      offset:16; size:4; signed:1;
+       unsigned int drops;             //      offset:20; size:4; signed:0;
+       unsigned int processed;         //      offset:24; size:4; signed:0;
+       int sched;                      //      offset:28; size:4; signed:1;
+       unsigned int xdp_pass;          //      offset:32; size:4; signed:0;
+       unsigned int xdp_drop;          //      offset:36; size:4; signed:0;
+       unsigned int xdp_redirect;      //      offset:40; size:4; signed:0;
 };
 
 SEC("tracepoint/xdp/xdp_cpumap_kthread")
@@ -712,6 +718,9 @@ int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
                return 0;
        rec->processed += ctx->processed;
        rec->dropped   += ctx->drops;
+       rec->xdp_pass  += ctx->xdp_pass;
+       rec->xdp_drop  += ctx->xdp_drop;
+       rec->xdp_redirect  += ctx->xdp_redirect;
 
        /* Count times kthread yielded CPU via schedule call */
        if (ctx->sched)
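The cpumap value grows from a bare queue size to struct bpf_cpumap_val, so each entry can also carry an XDP program to execute on the remote CPU; the new tracepoint fields report that program's per-verdict counts. The layout added to include/uapi/linux/bpf.h in this series:

    struct bpf_cpumap_val {
            __u32 qsize;    /* queue size to remote target CPU */
            union {
                    int   fd;       /* prog fd on map write */
                    __u32 id;       /* prog id on map read */
            } bpf_prog;
    };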
index f4e755e..004c062 100644 (file)
@@ -70,6 +70,11 @@ static const struct option long_options[] = {
        {"stress-mode", no_argument,            NULL, 'x' },
        {"no-separators", no_argument,          NULL, 'z' },
        {"force",       no_argument,            NULL, 'F' },
+       {"mprog-disable", no_argument,          NULL, 'n' },
+       {"mprog-name",  required_argument,      NULL, 'e' },
+       {"mprog-filename", required_argument,   NULL, 'f' },
+       {"redirect-device", required_argument,  NULL, 'r' },
+       {"redirect-map", required_argument,     NULL, 'm' },
        {0, 0, NULL,  0 }
 };
 
@@ -156,6 +161,9 @@ struct datarec {
        __u64 processed;
        __u64 dropped;
        __u64 issue;
+       __u64 xdp_pass;
+       __u64 xdp_drop;
+       __u64 xdp_redirect;
 };
 struct record {
        __u64 timestamp;
@@ -175,6 +183,9 @@ static bool map_collect_percpu(int fd, __u32 key, struct record *rec)
        /* For percpu maps, userspace gets a value per possible CPU */
        unsigned int nr_cpus = bpf_num_possible_cpus();
        struct datarec values[nr_cpus];
+       __u64 sum_xdp_redirect = 0;
+       __u64 sum_xdp_pass = 0;
+       __u64 sum_xdp_drop = 0;
        __u64 sum_processed = 0;
        __u64 sum_dropped = 0;
        __u64 sum_issue = 0;
@@ -196,10 +207,19 @@ static bool map_collect_percpu(int fd, __u32 key, struct record *rec)
                sum_dropped        += values[i].dropped;
                rec->cpu[i].issue = values[i].issue;
                sum_issue        += values[i].issue;
+               rec->cpu[i].xdp_pass = values[i].xdp_pass;
+               sum_xdp_pass += values[i].xdp_pass;
+               rec->cpu[i].xdp_drop = values[i].xdp_drop;
+               sum_xdp_drop += values[i].xdp_drop;
+               rec->cpu[i].xdp_redirect = values[i].xdp_redirect;
+               sum_xdp_redirect += values[i].xdp_redirect;
        }
        rec->total.processed = sum_processed;
        rec->total.dropped   = sum_dropped;
        rec->total.issue     = sum_issue;
+       rec->total.xdp_pass  = sum_xdp_pass;
+       rec->total.xdp_drop  = sum_xdp_drop;
+       rec->total.xdp_redirect = sum_xdp_redirect;
        return true;
 }
 
@@ -300,17 +320,33 @@ static __u64 calc_errs_pps(struct datarec *r,
        return pps;
 }
 
+static void calc_xdp_pps(struct datarec *r, struct datarec *p,
+                        double *xdp_pass, double *xdp_drop,
+                        double *xdp_redirect, double period_)
+{
+       *xdp_pass = 0, *xdp_drop = 0, *xdp_redirect = 0;
+       if (period_ > 0) {
+               *xdp_redirect = (r->xdp_redirect - p->xdp_redirect) / period_;
+               *xdp_pass = (r->xdp_pass - p->xdp_pass) / period_;
+               *xdp_drop = (r->xdp_drop - p->xdp_drop) / period_;
+       }
+}
+
 static void stats_print(struct stats_record *stats_rec,
                        struct stats_record *stats_prev,
-                       char *prog_name)
+                       char *prog_name, char *mprog_name, int mprog_fd)
 {
        unsigned int nr_cpus = bpf_num_possible_cpus();
        double pps = 0, drop = 0, err = 0;
+       bool mprog_enabled = false;
        struct record *rec, *prev;
        int to_cpu;
        double t;
        int i;
 
+       if (mprog_fd > 0)
+               mprog_enabled = true;
+
        /* Header */
        printf("Running XDP/eBPF prog_name:%s\n", prog_name);
        printf("%-15s %-7s %-14s %-11s %-9s\n",
@@ -455,6 +491,34 @@ static void stats_print(struct stats_record *stats_rec,
                printf(fm2_err, "xdp_exception", "total", pps, drop);
        }
 
+       /* CPUMAP attached XDP program that runs on remote/destination CPU */
+       if (mprog_enabled) {
+               char *fmt_k = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f\n";
+               char *fm2_k = "%-15s %-7s %'-14.0f %'-11.0f %'-10.0f\n";
+               double xdp_pass, xdp_drop, xdp_redirect;
+
+               printf("\n2nd remote XDP/eBPF prog_name: %s\n", mprog_name);
+               printf("%-15s %-7s %-14s %-11s %-9s\n",
+                      "XDP-cpumap", "CPU:to", "xdp-pass", "xdp-drop", "xdp-redir");
+
+               rec  = &stats_rec->kthread;
+               prev = &stats_prev->kthread;
+               t = calc_period(rec, prev);
+               for (i = 0; i < nr_cpus; i++) {
+                       struct datarec *r = &rec->cpu[i];
+                       struct datarec *p = &prev->cpu[i];
+
+                       calc_xdp_pps(r, p, &xdp_pass, &xdp_drop,
+                                    &xdp_redirect, t);
+                       if (xdp_pass > 0 || xdp_drop > 0 || xdp_redirect > 0)
+                               printf(fmt_k, "xdp-in-kthread", i, xdp_pass, xdp_drop,
+                                      xdp_redirect);
+               }
+               calc_xdp_pps(&rec->total, &prev->total, &xdp_pass, &xdp_drop,
+                            &xdp_redirect, t);
+               printf(fm2_k, "xdp-in-kthread", "total", xdp_pass, xdp_drop, xdp_redirect);
+       }
+
        printf("\n");
        fflush(stdout);
 }
@@ -491,7 +555,7 @@ static inline void swap(struct stats_record **a, struct stats_record **b)
        *b = tmp;
 }
 
-static int create_cpu_entry(__u32 cpu, __u32 queue_size,
+static int create_cpu_entry(__u32 cpu, struct bpf_cpumap_val *value,
                            __u32 avail_idx, bool new)
 {
        __u32 curr_cpus_count = 0;
@@ -501,7 +565,7 @@ static int create_cpu_entry(__u32 cpu, __u32 queue_size,
        /* Add a CPU entry to cpumap, as this allocate a cpu entry in
         * the kernel for the cpu.
         */
-       ret = bpf_map_update_elem(cpu_map_fd, &cpu, &queue_size, 0);
+       ret = bpf_map_update_elem(cpu_map_fd, &cpu, value, 0);
        if (ret) {
                fprintf(stderr, "Create CPU entry failed (err:%d)\n", ret);
                exit(EXIT_FAIL_BPF);
@@ -532,9 +596,9 @@ static int create_cpu_entry(__u32 cpu, __u32 queue_size,
                }
        }
        /* map_fd[7] = cpus_iterator */
-       printf("%s CPU:%u as idx:%u queue_size:%d (total cpus_count:%u)\n",
+       printf("%s CPU:%u as idx:%u qsize:%d prog_fd: %d (cpus_count:%u)\n",
               new ? "Add-new":"Replace", cpu, avail_idx,
-              queue_size, curr_cpus_count);
+              value->qsize, value->bpf_prog.fd, curr_cpus_count);
 
        return 0;
 }
@@ -558,21 +622,26 @@ static void mark_cpus_unavailable(void)
 }
 
 /* Stress cpumap management code by concurrently changing underlying cpumap */
-static void stress_cpumap(void)
+static void stress_cpumap(struct bpf_cpumap_val *value)
 {
        /* Changing qsize will cause kernel to free and alloc a new
         * bpf_cpu_map_entry, with an associated/complicated tear-down
         * procedure.
         */
-       create_cpu_entry(1,  1024, 0, false);
-       create_cpu_entry(1,     8, 0, false);
-       create_cpu_entry(1, 16000, 0, false);
+       value->qsize = 1024;
+       create_cpu_entry(1, value, 0, false);
+       value->qsize = 8;
+       create_cpu_entry(1, value, 0, false);
+       value->qsize = 16000;
+       create_cpu_entry(1, value, 0, false);
 }
 
 static void stats_poll(int interval, bool use_separators, char *prog_name,
+                      char *mprog_name, struct bpf_cpumap_val *value,
                       bool stress_mode)
 {
        struct stats_record *record, *prev;
+       int mprog_fd;
 
        record = alloc_stats_record();
        prev   = alloc_stats_record();
@@ -584,11 +653,12 @@ static void stats_poll(int interval, bool use_separators, char *prog_name,
 
        while (1) {
                swap(&prev, &record);
+               mprog_fd = value->bpf_prog.fd;
                stats_collect(record);
-               stats_print(record, prev, prog_name);
+               stats_print(record, prev, prog_name, mprog_name, mprog_fd);
                sleep(interval);
                if (stress_mode)
-                       stress_cpumap();
+                       stress_cpumap(value);
        }
 
        free_stats_record(record);
@@ -661,15 +731,66 @@ static int init_map_fds(struct bpf_object *obj)
        return 0;
 }
 
+static int load_cpumap_prog(char *file_name, char *prog_name,
+                           char *redir_interface, char *redir_map)
+{
+       struct bpf_prog_load_attr prog_load_attr = {
+               .prog_type              = BPF_PROG_TYPE_XDP,
+               .expected_attach_type   = BPF_XDP_CPUMAP,
+               .file = file_name,
+       };
+       struct bpf_program *prog;
+       struct bpf_object *obj;
+       int fd;
+
+       if (bpf_prog_load_xattr(&prog_load_attr, &obj, &fd))
+               return -1;
+
+       if (fd < 0) {
+               fprintf(stderr, "ERR: bpf_prog_load_xattr: %s\n",
+                       strerror(errno));
+               return fd;
+       }
+
+       if (redir_interface && redir_map) {
+               int err, map_fd, ifindex_out, key = 0;
+
+               map_fd = bpf_object__find_map_fd_by_name(obj, redir_map);
+               if (map_fd < 0)
+                       return map_fd;
+
+               ifindex_out = if_nametoindex(redir_interface);
+               if (!ifindex_out)
+                       return -1;
+
+               err = bpf_map_update_elem(map_fd, &key, &ifindex_out, 0);
+               if (err < 0)
+                       return err;
+       }
+
+       prog = bpf_object__find_program_by_title(obj, prog_name);
+       if (!prog) {
+               fprintf(stderr, "bpf_object__find_program_by_title failed\n");
+               return EXIT_FAIL;
+       }
+
+       return bpf_program__fd(prog);
+}
+
 int main(int argc, char **argv)
 {
        struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY};
        char *prog_name = "xdp_cpu_map5_lb_hash_ip_pairs";
+       char *mprog_filename = "xdp_redirect_kern.o";
+       char *redir_interface = NULL, *redir_map = NULL;
+       char *mprog_name = "xdp_redirect_dummy";
+       bool mprog_disable = false;
        struct bpf_prog_load_attr prog_load_attr = {
                .prog_type      = BPF_PROG_TYPE_UNSPEC,
        };
        struct bpf_prog_info info = {};
        __u32 info_len = sizeof(info);
+       struct bpf_cpumap_val value;
        bool use_separators = true;
        bool stress_mode = false;
        struct bpf_program *prog;
@@ -681,6 +802,7 @@ int main(int argc, char **argv)
        int add_cpu = -1;
        int opt, err;
        int prog_fd;
+       int *cpu, i;
        __u32 qsize;
 
        n_cpus = get_nprocs_conf();
@@ -716,8 +838,15 @@ int main(int argc, char **argv)
        }
        mark_cpus_unavailable();
 
+       cpu = malloc(n_cpus * sizeof(int));
+       if (!cpu) {
+               fprintf(stderr, "failed to allocate cpu array\n");
+               return EXIT_FAIL;
+       }
+       memset(cpu, 0, n_cpus * sizeof(int));
+
        /* Parse commands line args */
-       while ((opt = getopt_long(argc, argv, "hSd:s:p:q:c:xzF",
+       while ((opt = getopt_long(argc, argv, "hSd:s:p:q:c:xzFf:e:r:m:",
                                  long_options, &longindex)) != -1) {
                switch (opt) {
                case 'd':
@@ -751,6 +880,21 @@ int main(int argc, char **argv)
                        /* Selecting eBPF prog to load */
                        prog_name = optarg;
                        break;
+               case 'n':
+                       mprog_disable = true;
+                       break;
+               case 'f':
+                       mprog_filename = optarg;
+                       break;
+               case 'e':
+                       mprog_name = optarg;
+                       break;
+               case 'r':
+                       redir_interface = optarg;
+                       break;
+               case 'm':
+                       redir_map = optarg;
+                       break;
                case 'c':
                        /* Add multiple CPUs */
                        add_cpu = strtoul(optarg, NULL, 0);
@@ -760,8 +904,7 @@ int main(int argc, char **argv)
                                        errno, strerror(errno));
                                goto error;
                        }
-                       create_cpu_entry(add_cpu, qsize, added_cpus, true);
-                       added_cpus++;
+                       cpu[added_cpus++] = add_cpu;
                        break;
                case 'q':
                        qsize = atoi(optarg);
@@ -772,6 +915,7 @@ int main(int argc, char **argv)
                case 'h':
                error:
                default:
+                       free(cpu);
                        usage(argv, obj);
                        return EXIT_FAIL_OPTION;
                }
@@ -784,15 +928,30 @@ int main(int argc, char **argv)
        if (ifindex == -1) {
                fprintf(stderr, "ERR: required option --dev missing\n");
                usage(argv, obj);
-               return EXIT_FAIL_OPTION;
+               err = EXIT_FAIL_OPTION;
+               goto out;
        }
        /* Required option */
        if (add_cpu == -1) {
                fprintf(stderr, "ERR: required option --cpu missing\n");
                fprintf(stderr, " Specify multiple --cpu option to add more\n");
                usage(argv, obj);
-               return EXIT_FAIL_OPTION;
+               err = EXIT_FAIL_OPTION;
+               goto out;
+       }
+
+       value.bpf_prog.fd = 0;
+       if (!mprog_disable)
+               value.bpf_prog.fd = load_cpumap_prog(mprog_filename, mprog_name,
+                                                    redir_interface, redir_map);
+       if (value.bpf_prog.fd < 0) {
+               err = value.bpf_prog.fd;
+               goto out;
        }
+       value.qsize = qsize;
+
+       for (i = 0; i < added_cpus; i++)
+               create_cpu_entry(cpu[i], &value, i, true);
 
        /* Remove XDP program when program is interrupted or killed */
        signal(SIGINT, int_exit);
@@ -801,27 +960,33 @@ int main(int argc, char **argv)
        prog = bpf_object__find_program_by_title(obj, prog_name);
        if (!prog) {
                fprintf(stderr, "bpf_object__find_program_by_title failed\n");
-               return EXIT_FAIL;
+               err = EXIT_FAIL;
+               goto out;
        }
 
        prog_fd = bpf_program__fd(prog);
        if (prog_fd < 0) {
                fprintf(stderr, "bpf_program__fd failed\n");
-               return EXIT_FAIL;
+               err = EXIT_FAIL;
+               goto out;
        }
 
        if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) {
                fprintf(stderr, "link set xdp fd failed\n");
-               return EXIT_FAIL_XDP;
+               err = EXIT_FAIL_XDP;
+               goto out;
        }
 
        err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
        if (err) {
                printf("can't get prog info - %s\n", strerror(errno));
-               return err;
+               goto out;
        }
        prog_id = info.id;
 
-       stats_poll(interval, use_separators, prog_name, stress_mode);
-       return EXIT_OK;
+       stats_poll(interval, use_separators, prog_name, mprog_name,
+                  &value, stress_mode);
+out:
+       free(cpu);
+       return err;
 }
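With the new options the sample loads a second XDP program into every cpumap entry and can point its redirect map at an egress device. An illustrative invocation (device names hypothetical; the -f/-e defaults are the xdp_redirect_kern.o / xdp_redirect_dummy pair set in main(), and tx_port is the map that object is expected to expose):

    # ./xdp_redirect_cpu --dev eth0 --cpu 2 --qsize 512 \
            --mprog-filename xdp_redirect_kern.o \
            --mprog-name xdp_redirect_dummy \
            --redirect-device eth1 --redirect-map tx_port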
index 6843376..5bfa448 100755 (executable)
@@ -404,6 +404,7 @@ class PrinterHelpers(Printer):
 
     type_fwds = [
             'struct bpf_fib_lookup',
+            'struct bpf_sk_lookup',
             'struct bpf_perf_event_data',
             'struct bpf_perf_event_value',
             'struct bpf_pidns_info',
@@ -450,6 +451,7 @@ class PrinterHelpers(Printer):
             'struct bpf_perf_event_data',
             'struct bpf_perf_event_value',
             'struct bpf_pidns_info',
+            'struct bpf_sk_lookup',
             'struct bpf_sock',
             'struct bpf_sock_addr',
             'struct bpf_sock_ops',
@@ -487,6 +489,11 @@ class PrinterHelpers(Printer):
             'struct sk_msg_buff': 'struct sk_msg_md',
             'struct xdp_buff': 'struct xdp_md',
     }
+    # Helpers overloaded for different context types.
+    overloaded_helpers = [
+        'bpf_get_socket_cookie',
+        'bpf_sk_assign',
+    ]
 
     def print_header(self):
         header = '''\
@@ -543,7 +550,7 @@ class PrinterHelpers(Printer):
         for i, a in enumerate(proto['args']):
             t = a['type']
             n = a['name']
-            if proto['name'] == 'bpf_get_socket_cookie' and i == 0:
+            if proto['name'] in self.overloaded_helpers and i == 0:
                     t = 'void'
                     n = 'ctx'
             one_arg = '{}{}'.format(comma, self.map_type(t))
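Adding bpf_sk_assign to overloaded_helpers makes the generator treat it like bpf_get_socket_cookie: the helper exists for more than one context type (struct __sk_buff * for TC, struct bpf_sk_lookup * for the new program type), so the emitted bpf_helper_defs.h declares the first parameter as a bare void *. Roughly the generated line (helper number taken from the UAPI enum, shown for illustration only):

    static long (*bpf_sk_assign)(void *ctx, struct bpf_sock *sk, __u64 flags) = (void *) 124;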
index 412ea3d..82e356b 100644 (file)
@@ -45,7 +45,7 @@ PROG COMMANDS
 |               **cgroup/getsockname4** | **cgroup/getsockname6** | **cgroup/sendmsg4** | **cgroup/sendmsg6** |
 |              **cgroup/recvmsg4** | **cgroup/recvmsg6** | **cgroup/sysctl** |
 |              **cgroup/getsockopt** | **cgroup/setsockopt** |
-|              **struct_ops** | **fentry** | **fexit** | **freplace**
+|              **struct_ops** | **fentry** | **fexit** | **freplace** | **sk_lookup**
 |      }
 |       *ATTACH_TYPE* := {
 |              **msg_verdict** | **stream_verdict** | **stream_parser** | **flow_dissector**
index 25b25ac..7b13726 100644 (file)
@@ -479,7 +479,7 @@ _bpftool()
                                 cgroup/post_bind4 cgroup/post_bind6 \
                                 cgroup/sysctl cgroup/getsockopt \
                                 cgroup/setsockopt struct_ops \
-                                fentry fexit freplace" -- \
+                                fentry fexit freplace sk_lookup" -- \
                                                    "$cur" ) )
                             return 0
                             ;;
index 29f4e76..6530366 100644 (file)
@@ -1,10 +1,11 @@
 // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
 /* Copyright (C) 2017-2018 Netronome Systems, Inc. */
 
+#define _GNU_SOURCE
 #include <ctype.h>
 #include <errno.h>
 #include <fcntl.h>
-#include <fts.h>
+#include <ftw.h>
 #include <libgen.h>
 #include <mntent.h>
 #include <stdbool.h>
@@ -64,6 +65,7 @@ const char * const attach_type_name[__MAX_BPF_ATTACH_TYPE] = {
        [BPF_TRACE_FEXIT]               = "fexit",
        [BPF_MODIFY_RETURN]             = "mod_ret",
        [BPF_LSM_MAC]                   = "lsm_mac",
+       [BPF_SK_LOOKUP]                 = "sk_lookup",
 };
 
 void p_err(const char *fmt, ...)
@@ -160,24 +162,35 @@ int mount_tracefs(const char *target)
        return err;
 }
 
-int open_obj_pinned(char *path, bool quiet)
+int open_obj_pinned(const char *path, bool quiet)
 {
-       int fd;
+       char *pname;
+       int fd = -1;
+
+       pname = strdup(path);
+       if (!pname) {
+               if (!quiet)
+                       p_err("mem alloc failed");
+               goto out_ret;
+       }
 
-       fd = bpf_obj_get(path);
+       fd = bpf_obj_get(pname);
        if (fd < 0) {
                if (!quiet)
-                       p_err("bpf obj get (%s): %s", path,
-                             errno == EACCES && !is_bpffs(dirname(path)) ?
+                       p_err("bpf obj get (%s): %s", pname,
+                             errno == EACCES && !is_bpffs(dirname(pname)) ?
                            "directory not in bpf file system (bpffs)" :
                            strerror(errno));
-               return -1;
+               goto out_free;
        }
 
+out_free:
+       free(pname);
+out_ret:
        return fd;
 }
 
-int open_obj_pinned_any(char *path, enum bpf_obj_type exp_type)
+int open_obj_pinned_any(const char *path, enum bpf_obj_type exp_type)
 {
        enum bpf_obj_type type;
        int fd;
@@ -367,71 +380,82 @@ void print_hex_data_json(uint8_t *data, size_t len)
        jsonw_end_array(json_wtr);
 }
 
+/* extra params for nftw cb */
+static struct pinned_obj_table *build_fn_table;
+static enum bpf_obj_type build_fn_type;
+
+static int do_build_table_cb(const char *fpath, const struct stat *sb,
+                            int typeflag, struct FTW *ftwbuf)
+{
+       struct bpf_prog_info pinned_info;
+       __u32 len = sizeof(pinned_info);
+       struct pinned_obj *obj_node;
+       enum bpf_obj_type objtype;
+       int fd, err = 0;
+
+       if (typeflag != FTW_F)
+               goto out_ret;
+
+       fd = open_obj_pinned(fpath, true);
+       if (fd < 0)
+               goto out_ret;
+
+       objtype = get_fd_type(fd);
+       if (objtype != build_fn_type)
+               goto out_close;
+
+       memset(&pinned_info, 0, sizeof(pinned_info));
+       if (bpf_obj_get_info_by_fd(fd, &pinned_info, &len))
+               goto out_close;
+
+       obj_node = calloc(1, sizeof(*obj_node));
+       if (!obj_node) {
+               err = -1;
+               goto out_close;
+       }
+
+       obj_node->id = pinned_info.id;
+       obj_node->path = strdup(fpath);
+       if (!obj_node->path) {
+               err = -1;
+               free(obj_node);
+               goto out_close;
+       }
+
+       hash_add(build_fn_table->table, &obj_node->hash, obj_node->id);
+out_close:
+       close(fd);
+out_ret:
+       return err;
+}
+
 int build_pinned_obj_table(struct pinned_obj_table *tab,
                           enum bpf_obj_type type)
 {
-       struct bpf_prog_info pinned_info = {};
-       struct pinned_obj *obj_node = NULL;
-       __u32 len = sizeof(pinned_info);
        struct mntent *mntent = NULL;
-       enum bpf_obj_type objtype;
        FILE *mntfile = NULL;
-       FTSENT *ftse = NULL;
-       FTS *fts = NULL;
-       int fd, err;
+       int flags = FTW_PHYS;
+       int nopenfd = 16;
+       int err = 0;
 
        mntfile = setmntent("/proc/mounts", "r");
        if (!mntfile)
                return -1;
 
+       build_fn_table = tab;
+       build_fn_type = type;
+
        while ((mntent = getmntent(mntfile))) {
-               char *path[] = { mntent->mnt_dir, NULL };
+               char *path = mntent->mnt_dir;
 
                if (strncmp(mntent->mnt_type, "bpf", 3) != 0)
                        continue;
-
-               fts = fts_open(path, 0, NULL);
-               if (!fts)
-                       continue;
-
-               while ((ftse = fts_read(fts))) {
-                       if (!(ftse->fts_info & FTS_F))
-                               continue;
-                       fd = open_obj_pinned(ftse->fts_path, true);
-                       if (fd < 0)
-                               continue;
-
-                       objtype = get_fd_type(fd);
-                       if (objtype != type) {
-                               close(fd);
-                               continue;
-                       }
-                       memset(&pinned_info, 0, sizeof(pinned_info));
-                       err = bpf_obj_get_info_by_fd(fd, &pinned_info, &len);
-                       if (err) {
-                               close(fd);
-                               continue;
-                       }
-
-                       obj_node = malloc(sizeof(*obj_node));
-                       if (!obj_node) {
-                               close(fd);
-                               fts_close(fts);
-                               fclose(mntfile);
-                               return -1;
-                       }
-
-                       memset(obj_node, 0, sizeof(*obj_node));
-                       obj_node->id = pinned_info.id;
-                       obj_node->path = strdup(ftse->fts_path);
-                       hash_add(tab->table, &obj_node->hash, obj_node->id);
-
-                       close(fd);
-               }
-               fts_close(fts);
+               err = nftw(path, do_build_table_cb, nopenfd, flags);
+               if (err)
+                       break;
        }
        fclose(mntfile);
-       return 0;
+       return err;
 }
 
 void delete_pinned_obj_table(struct pinned_obj_table *tab)
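
Since nftw() takes no user-data argument, the rewrite above hands state to the
callback through file-scope variables (build_fn_table, build_fn_type) set before the
walk starts. A minimal, self-contained sketch of that pattern, with illustrative
names and path:

  #define _GNU_SOURCE
  #include <ftw.h>
  #include <stdio.h>
  #include <sys/stat.h>

  static int file_count;	/* stands in for build_fn_table/build_fn_type */

  static int count_cb(const char *fpath, const struct stat *sb,
  		    int typeflag, struct FTW *ftwbuf)
  {
  	if (typeflag == FTW_F)	/* regular files only, as above */
  		file_count++;
  	return 0;		/* non-zero would stop the walk, like err above */
  }

  int main(void)
  {
  	file_count = 0;
  	if (nftw("/sys/fs/bpf", count_cb, 16 /* nopenfd */, FTW_PHYS))
  		return 1;
  	printf("%d regular files\n", file_count);
  	return 0;
  }
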
index b59d26e..8a4c2b3 100644 (file)
@@ -302,8 +302,11 @@ static int do_skeleton(int argc, char **argv)
        opts.object_name = obj_name;
        obj = bpf_object__open_mem(obj_data, file_sz, &opts);
        if (IS_ERR(obj)) {
+               char err_buf[256];
+
+               libbpf_strerror(PTR_ERR(obj), err_buf, sizeof(err_buf));
+               p_err("failed to open BPF object file: %s", err_buf);
                obj = NULL;
-               p_err("failed to open BPF object file: %ld", PTR_ERR(obj));
                goto out;
        }
 
index 78d34e8..e3a79b5 100644 (file)
@@ -152,8 +152,8 @@ int cmd_select(const struct cmd *cmds, int argc, char **argv,
 int get_fd_type(int fd);
 const char *get_fd_type_name(enum bpf_obj_type type);
 char *get_fdinfo(int fd, const char *key);
-int open_obj_pinned(char *path, bool quiet);
-int open_obj_pinned_any(char *path, enum bpf_obj_type exp_type);
+int open_obj_pinned(const char *path, bool quiet);
+int open_obj_pinned_any(const char *path, enum bpf_obj_type exp_type);
 int mount_bpffs_for_pin(const char *name);
 int do_pin_any(int argc, char **argv, int (*get_fd_by_id)(int *, char ***));
 int do_pin_fd(int fd, const char *name);
index 6863c57..3e6ecc6 100644 (file)
@@ -59,6 +59,7 @@ const char * const prog_type_name[] = {
        [BPF_PROG_TYPE_TRACING]                 = "tracing",
        [BPF_PROG_TYPE_STRUCT_OPS]              = "struct_ops",
        [BPF_PROG_TYPE_EXT]                     = "ext",
+       [BPF_PROG_TYPE_SK_LOOKUP]               = "sk_lookup",
 };
 
 const size_t prog_type_name_size = ARRAY_SIZE(prog_type_name);
@@ -1905,7 +1906,7 @@ static int do_help(int argc, char **argv)
                "                 cgroup/getsockname4 | cgroup/getsockname6 | cgroup/sendmsg4 |\n"
                "                 cgroup/sendmsg6 | cgroup/recvmsg4 | cgroup/recvmsg6 |\n"
                "                 cgroup/getsockopt | cgroup/setsockopt |\n"
-               "                 struct_ops | fentry | fexit | freplace }\n"
+               "                 struct_ops | fentry | fexit | freplace | sk_lookup }\n"
                "       ATTACH_TYPE := { msg_verdict | stream_verdict | stream_parser |\n"
                "                        flow_dissector }\n"
                "       METRIC := { cycles | instructions | l1d_loads | llc_misses }\n"
index 8468a60..d9b4209 100644 (file)
@@ -71,7 +71,8 @@ int iter(struct bpf_iter__task_file *ctx)
 
        e.pid = task->tgid;
        e.id = get_obj_id(file->private_data, obj_type);
-       bpf_probe_read(&e.comm, sizeof(e.comm), task->group_leader->comm);
+       bpf_probe_read_kernel(&e.comm, sizeof(e.comm),
+                             task->group_leader->comm);
        bpf_seq_write(ctx->meta->seq, &e, sizeof(e));
 
        return 0;
index fe01977..4867d54 100644 (file)
@@ -3,6 +3,8 @@
 #ifndef _LINUX_BTF_IDS_H
 #define _LINUX_BTF_IDS_H
 
+#ifdef CONFIG_DEBUG_INFO_BTF
+
 #include <linux/compiler.h> /* for __PASTE */
 
 /*
@@ -21,7 +23,7 @@
 asm(                                                   \
 ".pushsection " BTF_IDS_SECTION ",\"a\";       \n"     \
 ".local " #symbol " ;                          \n"     \
-".type  " #symbol ", @object;                  \n"     \
+".type  " #symbol ", STT_OBJECT;               \n"     \
 ".size  " #symbol ", 4;                        \n"     \
 #symbol ":                                     \n"     \
 ".zero 4                                       \n"     \
@@ -55,17 +57,20 @@ asm(                                                        \
  * .zero 4
  *
  */
-#define __BTF_ID_LIST(name)                            \
+#define __BTF_ID_LIST(name, scope)                     \
 asm(                                                   \
 ".pushsection " BTF_IDS_SECTION ",\"a\";       \n"     \
-".local " #name ";                             \n"     \
+"." #scope " " #name ";                        \n"     \
 #name ":;                                      \n"     \
 ".popsection;                                  \n");   \
 
 #define BTF_ID_LIST(name)                              \
-__BTF_ID_LIST(name)                                    \
+__BTF_ID_LIST(name, local)                             \
 extern u32 name[];
 
+#define BTF_ID_LIST_GLOBAL(name)                       \
+__BTF_ID_LIST(name, globl)
+
 /*
  * The BTF_ID_UNUSED macro defines 4 zero bytes.
  * It's used when we want to define 'unused' entry
@@ -83,5 +88,43 @@ asm(                                                 \
 ".zero 4                                       \n"     \
 ".popsection;                                  \n");
 
+#else
+
+#define BTF_ID_LIST(name) static u32 name[5];
+#define BTF_ID(prefix, name)
+#define BTF_ID_UNUSED
+#define BTF_ID_LIST_GLOBAL(name) u32 name[1];
+
+#endif /* CONFIG_DEBUG_INFO_BTF */
+
+#ifdef CONFIG_NET
+/* Define a list of socket types which can be the argument for
+ * skc_to_*_sock() helpers. All these socket types must have
+ * sock_common as the first member of their memory layout.
+ */
+#define BTF_SOCK_TYPE_xxx \
+       BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET, inet_sock)                    \
+       BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_CONN, inet_connection_sock)    \
+       BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_REQ, inet_request_sock)        \
+       BTF_SOCK_TYPE(BTF_SOCK_TYPE_INET_TW, inet_timewait_sock)        \
+       BTF_SOCK_TYPE(BTF_SOCK_TYPE_REQ, request_sock)                  \
+       BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCK, sock)                         \
+       BTF_SOCK_TYPE(BTF_SOCK_TYPE_SOCK_COMMON, sock_common)           \
+       BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP, tcp_sock)                      \
+       BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_REQ, tcp_request_sock)          \
+       BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP_TW, tcp_timewait_sock)          \
+       BTF_SOCK_TYPE(BTF_SOCK_TYPE_TCP6, tcp6_sock)                    \
+       BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP, udp_sock)                      \
+       BTF_SOCK_TYPE(BTF_SOCK_TYPE_UDP6, udp6_sock)
+
+enum {
+#define BTF_SOCK_TYPE(name, str) name,
+BTF_SOCK_TYPE_xxx
+#undef BTF_SOCK_TYPE
+MAX_BTF_SOCK_TYPE,
+};
+
+extern u32 btf_sock_ids[];
+#endif
 
 #endif
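
BTF_SOCK_TYPE_xxx above is an X-macro list: each expansion site defines
BTF_SOCK_TYPE() to extract the field it needs. As a sketch, the enum block above
expands to roughly the following (abbreviated); the same list can then be expanded
again with a BTF_ID()-emitting definition to populate btf_sock_ids:

  enum {
  	BTF_SOCK_TYPE_INET,		/* inet_sock */
  	BTF_SOCK_TYPE_INET_CONN,	/* inet_connection_sock */
  	/* ... one constant per BTF_SOCK_TYPE() entry ... */
  	BTF_SOCK_TYPE_UDP6,		/* udp6_sock */
  	MAX_BTF_SOCK_TYPE,
  };
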
index 5e38638..54d0c88 100644 (file)
@@ -189,6 +189,7 @@ enum bpf_prog_type {
        BPF_PROG_TYPE_STRUCT_OPS,
        BPF_PROG_TYPE_EXT,
        BPF_PROG_TYPE_LSM,
+       BPF_PROG_TYPE_SK_LOOKUP,
 };
 
 enum bpf_attach_type {
@@ -227,6 +228,8 @@ enum bpf_attach_type {
        BPF_CGROUP_INET6_GETSOCKNAME,
        BPF_XDP_DEVMAP,
        BPF_CGROUP_INET_SOCK_RELEASE,
+       BPF_XDP_CPUMAP,
+       BPF_SK_LOOKUP,
        __MAX_BPF_ATTACH_TYPE
 };
 
@@ -2419,7 +2422,7 @@ union bpf_attr {
  *                     Look for an IPv6 socket.
  *
  *             If the *netns* is a negative signed 32-bit integer, then the
- *             socket lookup table in the netns associated with the *ctx* will
+ *             socket lookup table in the netns associated with the *ctx*
  *             will be used. For the TC hooks, this is the netns of the device
  *             in the skb. For socket hooks, this is the netns of the socket.
  *             If *netns* is any other signed 32-bit value greater than or
@@ -2456,7 +2459,7 @@ union bpf_attr {
  *                     Look for an IPv6 socket.
  *
  *             If the *netns* is a negative signed 32-bit integer, then the
- *             socket lookup table in the netns associated with the *ctx* will
+ *             socket lookup table in the netns associated with the *ctx*
  *             will be used. For the TC hooks, this is the netns of the device
  *             in the skb. For socket hooks, this is the netns of the socket.
  *             If *netns* is any other signed 32-bit value greater than or
@@ -3068,6 +3071,10 @@ union bpf_attr {
  *
  * long bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags)
  *     Description
+ *             The helper is overloaded depending on the BPF program type.
+ *             This description applies to **BPF_PROG_TYPE_SCHED_CLS** and
+ *             **BPF_PROG_TYPE_SCHED_ACT** programs.
+ *
  *             Assign the *sk* to the *skb*. When combined with appropriate
  *             routing configuration to receive the packet towards the socket,
 *             this will cause *skb* to be delivered to the specified socket.
@@ -3093,6 +3100,56 @@ union bpf_attr {
  *             **-ESOCKTNOSUPPORT** if the socket type is not supported
  *             (reuseport).
  *
+ * long bpf_sk_assign(struct bpf_sk_lookup *ctx, struct bpf_sock *sk, u64 flags)
+ *     Description
+ *             The helper is overloaded depending on the BPF program type.
+ *             This description applies to **BPF_PROG_TYPE_SK_LOOKUP** programs.
+ *
+ *             Select the *sk* as a result of a socket lookup.
+ *
+ *             For the operation to succeed, the passed socket must be
+ *             compatible with the packet description provided by the
+ *             *ctx* object.
+ *
+ *             The L4 protocol (**IPPROTO_TCP** or **IPPROTO_UDP**) must
+ *             be an exact match, while the IP family (**AF_INET** or
+ *             **AF_INET6**) must be compatible; that is, IPv6 sockets
+ *             that are not v6-only can be selected for IPv4 packets.
+ *
+ *             Only TCP listeners and unconnected UDP sockets can be
+ *             selected. *sk* can also be NULL to reset any previous
+ *             selection.
+ *
+ *             The *flags* argument can be a combination of the following
+ *             values:
+ *
+ *             * **BPF_SK_LOOKUP_F_REPLACE** to override the previous
+ *               socket selection, potentially done by a BPF program
+ *               that ran before us.
+ *
+ *             * **BPF_SK_LOOKUP_F_NO_REUSEPORT** to skip
+ *               load-balancing within the reuseport group for the
+ *               socket being selected.
+ *
+ *             On success *ctx->sk* will point to the selected socket.
+ *
+ *     Return
+ *             0 on success, or a negative errno in case of failure.
+ *
+ *             * **-EAFNOSUPPORT** if socket family (*sk->family*) is
+ *               not compatible with packet family (*ctx->family*).
+ *
+ *             * **-EEXIST** if a socket has already been selected,
+ *               potentially by another program, and
+ *               **BPF_SK_LOOKUP_F_REPLACE** flag was not specified.
+ *
+ *             * **-EINVAL** if unsupported flags were specified.
+ *
+ *             * **-EPROTOTYPE** if socket L4 protocol
+ *               (*sk->protocol*) doesn't match packet protocol
+ *               (*ctx->protocol*).
+ *
+ *             * **-ESOCKTNOSUPPORT** if the socket is not in an allowed
+ *               state (TCP listening or UDP unconnected).
+ *
  * u64 bpf_ktime_get_boot_ns(void)
  *     Description
  *             Return the time elapsed since system boot, in nanoseconds.
@@ -3606,6 +3663,12 @@ enum {
        BPF_RINGBUF_HDR_SZ              = 8,
 };
 
+/* BPF_FUNC_sk_assign flags in bpf_sk_lookup context. */
+enum {
+       BPF_SK_LOOKUP_F_REPLACE         = (1ULL << 0),
+       BPF_SK_LOOKUP_F_NO_REUSEPORT    = (1ULL << 1),
+};
+
 /* Mode for BPF_FUNC_skb_adjust_room helper. */
 enum bpf_adj_room_mode {
        BPF_ADJ_ROOM_NET,
@@ -3849,6 +3912,19 @@ struct bpf_devmap_val {
        } bpf_prog;
 };
 
+/* CPUMAP map-value layout
+ *
+ * The data layout of the map value is a configuration interface.
+ * New members can only be added to the end of this structure.
+ */
+struct bpf_cpumap_val {
+       __u32 qsize;    /* queue size to remote target CPU */
+       union {
+               int   fd;       /* prog fd on map write */
+               __u32 id;       /* prog id on map read */
+       } bpf_prog;
+};
+
 enum sk_action {
        SK_DROP = 0,
        SK_PASS,
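
A hedged user-space sketch of how struct bpf_cpumap_val is meant to be used: write
an entry carrying the queue size together with a program fd; reading the entry back
reports the program id instead. The map fd, CPU index, and qsize below are
illustrative, and the struct assumes UAPI headers from this series.

  #include <linux/bpf.h>
  #include <bpf/bpf.h>

  static int add_cpumap_entry(int cpumap_fd, __u32 cpu, int prog_fd)
  {
  	struct bpf_cpumap_val val = {
  		.qsize		= 2048,		/* ptr_ring size on remote CPU */
  		.bpf_prog.fd	= prog_fd,	/* fd on write, id on read */
  	};

  	return bpf_map_update_elem(cpumap_fd, &cpu, &val, 0);
  }
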
@@ -3986,7 +4062,7 @@ struct bpf_link_info {
 
 /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed
  * by user and intended to be used by socket (e.g. to bind to, depends on
- * attach attach type).
+ * attach type).
  */
 struct bpf_sock_addr {
        __u32 user_family;      /* Allows 4-byte read, but no write. */
@@ -4335,4 +4411,19 @@ struct bpf_pidns_info {
        __u32 pid;
        __u32 tgid;
 };
+
+/* User accessible data for SK_LOOKUP programs. Add new fields at the end. */
+struct bpf_sk_lookup {
+       __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */
+
+       __u32 family;           /* Protocol family (AF_INET, AF_INET6) */
+       __u32 protocol;         /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */
+       __u32 remote_ip4;       /* Network byte order */
+       __u32 remote_ip6[4];    /* Network byte order */
+       __u32 remote_port;      /* Network byte order */
+       __u32 local_ip4;        /* Network byte order */
+       __u32 local_ip6[4];     /* Network byte order */
+       __u32 local_port;       /* Host byte order */
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
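
Tying the new UAPI pieces together, here is a minimal sketch (not taken from this
series) of a BPF_PROG_TYPE_SK_LOOKUP program that steers TCP packets arriving on one
port to a socket stored in a sockmap. The map name, key, and port are illustrative:

  #include <linux/bpf.h>
  #include <linux/in.h>
  #include <bpf/bpf_helpers.h>

  struct {
  	__uint(type, BPF_MAP_TYPE_SOCKMAP);
  	__uint(max_entries, 1);
  	__uint(key_size, sizeof(__u32));
  	__uint(value_size, sizeof(__u64));
  } redir_map SEC(".maps");

  SEC("sk_lookup/steer_tcp")
  int steer_tcp(struct bpf_sk_lookup *ctx)
  {
  	const __u32 zero = 0;
  	struct bpf_sock *sk;
  	long err;

  	/* local_port is in host byte order in this context */
  	if (ctx->protocol != IPPROTO_TCP || ctx->local_port != 7007)
  		return SK_PASS;	/* not ours, fall back to regular lookup */

  	sk = bpf_map_lookup_elem(&redir_map, &zero);
  	if (!sk)
  		return SK_PASS;

  	err = bpf_sk_assign(ctx, sk, 0 /* flags */);
  	bpf_sk_release(sk);
  	return err ? SK_DROP : SK_PASS;
  }

  char _license[] SEC("license") = "GPL";
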
index a510d8e..bc14db7 100644 (file)
@@ -40,7 +40,7 @@
  * Helper macro to manipulate data structures
  */
 #ifndef offsetof
-#define offsetof(TYPE, MEMBER)  ((size_t)&((TYPE *)0)->MEMBER)
+#define offsetof(TYPE, MEMBER)  __builtin_offsetof(TYPE, MEMBER)
 #endif
 #ifndef container_of
 #define container_of(ptr, type, member)                                \
index 4489f95..846164c 100644 (file)
@@ -6799,6 +6799,7 @@ BPF_PROG_TYPE_FNS(perf_event, BPF_PROG_TYPE_PERF_EVENT);
 BPF_PROG_TYPE_FNS(tracing, BPF_PROG_TYPE_TRACING);
 BPF_PROG_TYPE_FNS(struct_ops, BPF_PROG_TYPE_STRUCT_OPS);
 BPF_PROG_TYPE_FNS(extension, BPF_PROG_TYPE_EXT);
+BPF_PROG_TYPE_FNS(sk_lookup, BPF_PROG_TYPE_SK_LOOKUP);
 
 enum bpf_attach_type
 bpf_program__get_expected_attach_type(struct bpf_program *prog)
@@ -6912,6 +6913,8 @@ static const struct bpf_sec_def section_defs[] = {
                .attach_fn = attach_iter),
        BPF_EAPROG_SEC("xdp_devmap/",           BPF_PROG_TYPE_XDP,
                                                BPF_XDP_DEVMAP),
+       BPF_EAPROG_SEC("xdp_cpumap/",           BPF_PROG_TYPE_XDP,
+                                               BPF_XDP_CPUMAP),
        BPF_PROG_SEC("xdp",                     BPF_PROG_TYPE_XDP),
        BPF_PROG_SEC("perf_event",              BPF_PROG_TYPE_PERF_EVENT),
        BPF_PROG_SEC("lwt_in",                  BPF_PROG_TYPE_LWT_IN),
@@ -6979,6 +6982,8 @@ static const struct bpf_sec_def section_defs[] = {
        BPF_EAPROG_SEC("cgroup/setsockopt",     BPF_PROG_TYPE_CGROUP_SOCKOPT,
                                                BPF_CGROUP_SETSOCKOPT),
        BPF_PROG_SEC("struct_ops",              BPF_PROG_TYPE_STRUCT_OPS),
+       BPF_EAPROG_SEC("sk_lookup/",            BPF_PROG_TYPE_SK_LOOKUP,
+                                               BPF_SK_LOOKUP),
 };
 
 #undef BPF_PROG_SEC_IMPL
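
With these section definitions libbpf infers both the program type and the expected
attach type from the ELF section name alone. A hedged sketch of a cpumap
second-chance XDP program on the BPF side (names illustrative):

  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  /* The "xdp_cpumap/" prefix maps to BPF_PROG_TYPE_XDP with
   * expected_attach_type = BPF_XDP_CPUMAP, so this program can be
   * installed into a cpumap entry and run after the redirect.
   */
  SEC("xdp_cpumap/pass")
  int xdp_cpumap_pass(struct xdp_md *ctx)
  {
  	return XDP_PASS;
  }

  char _license[] SEC("license") = "GPL";
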
index 2335971..c227213 100644 (file)
@@ -350,6 +350,7 @@ LIBBPF_API int bpf_program__set_perf_event(struct bpf_program *prog);
 LIBBPF_API int bpf_program__set_tracing(struct bpf_program *prog);
 LIBBPF_API int bpf_program__set_struct_ops(struct bpf_program *prog);
 LIBBPF_API int bpf_program__set_extension(struct bpf_program *prog);
+LIBBPF_API int bpf_program__set_sk_lookup(struct bpf_program *prog);
 
 LIBBPF_API enum bpf_prog_type bpf_program__get_type(struct bpf_program *prog);
 LIBBPF_API void bpf_program__set_type(struct bpf_program *prog,
@@ -377,6 +378,7 @@ LIBBPF_API bool bpf_program__is_perf_event(const struct bpf_program *prog);
 LIBBPF_API bool bpf_program__is_tracing(const struct bpf_program *prog);
 LIBBPF_API bool bpf_program__is_struct_ops(const struct bpf_program *prog);
 LIBBPF_API bool bpf_program__is_extension(const struct bpf_program *prog);
+LIBBPF_API bool bpf_program__is_sk_lookup(const struct bpf_program *prog);
 
 /*
  * No need for __attribute__((packed)), all members of 'bpf_map_def'
index c5d5c76..6f0856a 100644 (file)
@@ -287,6 +287,8 @@ LIBBPF_0.1.0 {
                bpf_map__type;
                bpf_map__value_size;
                bpf_program__autoload;
+               bpf_program__is_sk_lookup;
                bpf_program__set_autoload;
+               bpf_program__set_sk_lookup;
                btf__set_fd;
 } LIBBPF_0.0.9;
index 10cd8d1..5a3d3f0 100644 (file)
@@ -78,6 +78,9 @@ probe_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns,
        case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
                xattr.expected_attach_type = BPF_CGROUP_INET4_CONNECT;
                break;
+       case BPF_PROG_TYPE_SK_LOOKUP:
+               xattr.expected_attach_type = BPF_SK_LOOKUP;
+               break;
        case BPF_PROG_TYPE_KPROBE:
                xattr.kern_version = get_kernel_version();
                break;
index acd0871..f566556 100644 (file)
@@ -73,29 +73,8 @@ int start_server(int family, int type, const char *addr_str, __u16 port,
        socklen_t len;
        int fd;
 
-       if (family == AF_INET) {
-               struct sockaddr_in *sin = (void *)&addr;
-
-               sin->sin_family = AF_INET;
-               sin->sin_port = htons(port);
-               if (addr_str &&
-                   inet_pton(AF_INET, addr_str, &sin->sin_addr) != 1) {
-                       log_err("inet_pton(AF_INET, %s)", addr_str);
-                       return -1;
-               }
-               len = sizeof(*sin);
-       } else {
-               struct sockaddr_in6 *sin6 = (void *)&addr;
-
-               sin6->sin6_family = AF_INET6;
-               sin6->sin6_port = htons(port);
-               if (addr_str &&
-                   inet_pton(AF_INET6, addr_str, &sin6->sin6_addr) != 1) {
-                       log_err("inet_pton(AF_INET6, %s)", addr_str);
-                       return -1;
-               }
-               len = sizeof(*sin6);
-       }
+       if (make_sockaddr(family, addr_str, port, &addr, &len))
+               return -1;
 
        fd = socket(family, type, 0);
        if (fd < 0) {
@@ -194,3 +173,36 @@ int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms)
 
        return 0;
 }
+
+int make_sockaddr(int family, const char *addr_str, __u16 port,
+                 struct sockaddr_storage *addr, socklen_t *len)
+{
+       if (family == AF_INET) {
+               struct sockaddr_in *sin = (void *)addr;
+
+               sin->sin_family = AF_INET;
+               sin->sin_port = htons(port);
+               if (addr_str &&
+                   inet_pton(AF_INET, addr_str, &sin->sin_addr) != 1) {
+                       log_err("inet_pton(AF_INET, %s)", addr_str);
+                       return -1;
+               }
+               if (len)
+                       *len = sizeof(*sin);
+               return 0;
+       } else if (family == AF_INET6) {
+               struct sockaddr_in6 *sin6 = (void *)addr;
+
+               sin6->sin6_family = AF_INET6;
+               sin6->sin6_port = htons(port);
+               if (addr_str &&
+                   inet_pton(AF_INET6, addr_str, &sin6->sin6_addr) != 1) {
+                       log_err("inet_pton(AF_INET6, %s)", addr_str);
+                       return -1;
+               }
+               if (len)
+                       *len = sizeof(*sin6);
+               return 0;
+       }
+       return -1;
+}
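
A short usage sketch for the extracted helper, assuming callers follow the
start_server() pattern above; the address and port are illustrative:

  #include <sys/socket.h>
  #include <unistd.h>
  #include "network_helpers.h"

  static int bind_udp6_loopback(void)
  {
  	struct sockaddr_storage addr;
  	socklen_t len;
  	int fd;

  	if (make_sockaddr(AF_INET6, "::1", 7007, &addr, &len))
  		return -1;

  	fd = socket(AF_INET6, SOCK_DGRAM, 0);
  	if (fd < 0)
  		return -1;

  	if (bind(fd, (struct sockaddr *)&addr, len)) {
  		close(fd);
  		return -1;
  	}
  	return fd;
  }
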
index f580e82..c3728f6 100644 (file)
@@ -37,5 +37,7 @@ int start_server(int family, int type, const char *addr, __u16 port,
                 int timeout_ms);
 int connect_to_fd(int server_fd, int timeout_ms);
 int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms);
+int make_sockaddr(int family, const char *addr_str, __u16 port,
+                 struct sockaddr_storage *addr, socklen_t *len);
 
 #endif
index 403be6f..3b127ca 100644 (file)
@@ -6,6 +6,7 @@
 #include <bpf/libbpf.h>
 #include <linux/btf.h>
 #include <linux/kernel.h>
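+/* Define CONFIG_DEBUG_INFO_BTF before including btf_ids.h so the
+ * asm-based BTF_ID_LIST variants are emitted for resolve_btfids to fill.
+ */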
+#define CONFIG_DEBUG_INFO_BTF
 #include <linux/btf_ids.h>
 #include "test_progs.h"
 
@@ -27,7 +28,17 @@ struct symbol test_symbols[] = {
        { "func",    BTF_KIND_FUNC,    -1 },
 };
 
-BTF_ID_LIST(test_list)
+BTF_ID_LIST(test_list_local)
+BTF_ID_UNUSED
+BTF_ID(typedef, S)
+BTF_ID(typedef, T)
+BTF_ID(typedef, U)
+BTF_ID(struct,  S)
+BTF_ID(union,   U)
+BTF_ID(func,    func)
+
+extern __u32 test_list_global[];
+BTF_ID_LIST_GLOBAL(test_list_global)
 BTF_ID_UNUSED
 BTF_ID(typedef, S)
 BTF_ID(typedef, T)
@@ -93,18 +104,25 @@ static int resolve_symbols(void)
 
 int test_resolve_btfids(void)
 {
-       unsigned int i;
+       __u32 *test_list, *test_lists[] = { test_list_local, test_list_global };
+       unsigned int i, j;
        int ret = 0;
 
        if (resolve_symbols())
                return -1;
 
-       /* Check BTF_ID_LIST(test_list) IDs */
-       for (i = 0; i < ARRAY_SIZE(test_symbols) && !ret; i++) {
-               ret = CHECK(test_list[i] != test_symbols[i].id,
-                           "id_check",
-                           "wrong ID for %s (%d != %d)\n", test_symbols[i].name,
-                           test_list[i], test_symbols[i].id);
+       /* Check BTF_ID_LIST(test_list_local) and
+        * BTF_ID_LIST_GLOBAL(test_list_global) IDs
+        */
+       for (j = 0; j < ARRAY_SIZE(test_lists); j++) {
+               test_list = test_lists[j];
+               for (i = 0; i < ARRAY_SIZE(test_symbols) && !ret; i++) {
+                       ret = CHECK(test_list[i] != test_symbols[i].id,
+                                   "id_check",
+                                   "wrong ID for %s (%d != %d)\n",
+                                   test_symbols[i].name,
+                                   test_list[i], test_symbols[i].id);
+               }
        }
 
        return ret;
diff --git a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c
new file mode 100644 (file)
index 0000000..f1784ae
--- /dev/null
@@ -0,0 +1,1282 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+// Copyright (c) 2020 Cloudflare
+/*
+ * Test BPF attach point for INET socket lookup (BPF_SK_LOOKUP).
+ *
+ * Tests exercise:
+ *  - attaching/detaching/querying programs to the BPF_SK_LOOKUP hook,
+ *  - redirecting a socket lookup to a socket selected by a BPF program,
+ *  - failing a socket lookup on a BPF program's request,
+ *  - error scenarios for selecting a socket from a BPF program,
+ *  - accessing the BPF program context,
+ *  - attaching and running multiple BPF programs.
+ *
+ * Tests run in a dedicated network namespace.
+ */
+
+#define _GNU_SOURCE
+#include <arpa/inet.h>
+#include <assert.h>
+#include <errno.h>
+#include <error.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <bpf/libbpf.h>
+#include <bpf/bpf.h>
+
+#include "test_progs.h"
+#include "bpf_rlimit.h"
+#include "bpf_util.h"
+#include "cgroup_helpers.h"
+#include "network_helpers.h"
+#include "test_sk_lookup.skel.h"
+
+/* External (address, port) pairs the client sends packets to. */
+#define EXT_IP4                "127.0.0.1"
+#define EXT_IP6                "fd00::1"
+#define EXT_PORT       7007
+
+/* Internal (address, port) pairs the server listens/receives at. */
+#define INT_IP4                "127.0.0.2"
+#define INT_IP4_V6     "::ffff:127.0.0.2"
+#define INT_IP6                "fd00::2"
+#define INT_PORT       8008
+
+#define IO_TIMEOUT_SEC 3
+
+enum server {
+       SERVER_A = 0,
+       SERVER_B = 1,
+       MAX_SERVERS,
+};
+
+enum {
+       PROG1 = 0,
+       PROG2,
+};
+
+struct inet_addr {
+       const char *ip;
+       unsigned short port;
+};
+
+struct test {
+       const char *desc;
+       struct bpf_program *lookup_prog;
+       struct bpf_program *reuseport_prog;
+       struct bpf_map *sock_map;
+       int sotype;
+       struct inet_addr connect_to;
+       struct inet_addr listen_at;
+       enum server accept_on;
+};
+
+static __u32 duration;         /* for CHECK macro */
+
+static bool is_ipv6(const char *ip)
+{
+       return !!strchr(ip, ':');
+}
+
+static int attach_reuseport(int sock_fd, struct bpf_program *reuseport_prog)
+{
+       int err, prog_fd;
+
+       prog_fd = bpf_program__fd(reuseport_prog);
+       if (prog_fd < 0) {
+               errno = -prog_fd;
+               return -1;
+       }
+
+       err = setsockopt(sock_fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF,
+                        &prog_fd, sizeof(prog_fd));
+       if (err)
+               return -1;
+
+       return 0;
+}
+
+static socklen_t inetaddr_len(const struct sockaddr_storage *addr)
+{
+       return (addr->ss_family == AF_INET ? sizeof(struct sockaddr_in) :
+               addr->ss_family == AF_INET6 ? sizeof(struct sockaddr_in6) : 0);
+}
+
+static int make_socket(int sotype, const char *ip, int port,
+                      struct sockaddr_storage *addr)
+{
+       struct timeval timeo = { .tv_sec = IO_TIMEOUT_SEC };
+       int err, family, fd;
+
+       family = is_ipv6(ip) ? AF_INET6 : AF_INET;
+       err = make_sockaddr(family, ip, port, addr, NULL);
+       if (CHECK(err, "make_address", "failed\n"))
+               return -1;
+
+       fd = socket(addr->ss_family, sotype, 0);
+       if (CHECK(fd < 0, "socket", "failed\n")) {
+               log_err("failed to make socket");
+               return -1;
+       }
+
+       err = setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo));
+       if (CHECK(err, "setsockopt(SO_SNDTIMEO)", "failed\n")) {
+               log_err("failed to set SNDTIMEO");
+               close(fd);
+               return -1;
+       }
+
+       err = setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo));
+       if (CHECK(err, "setsockopt(SO_RCVTIMEO)", "failed\n")) {
+               log_err("failed to set RCVTIMEO");
+               close(fd);
+               return -1;
+       }
+
+       return fd;
+}
+
+static int make_server(int sotype, const char *ip, int port,
+                      struct bpf_program *reuseport_prog)
+{
+       struct sockaddr_storage addr = {0};
+       const int one = 1;
+       int err, fd = -1;
+
+       fd = make_socket(sotype, ip, port, &addr);
+       if (fd < 0)
+               return -1;
+
+       /* Enabled also on UDPv6 sockets so that IPv4-mapped IPv6 works. */
+       if (sotype == SOCK_DGRAM) {
+               err = setsockopt(fd, SOL_IP, IP_RECVORIGDSTADDR, &one,
+                                sizeof(one));
+               if (CHECK(err, "setsockopt(IP_RECVORIGDSTADDR)", "failed\n")) {
+                       log_err("failed to enable IP_RECVORIGDSTADDR");
+                       goto fail;
+               }
+       }
+
+       if (sotype == SOCK_DGRAM && addr.ss_family == AF_INET6) {
+               err = setsockopt(fd, SOL_IPV6, IPV6_RECVORIGDSTADDR, &one,
+                                sizeof(one));
+               if (CHECK(err, "setsockopt(IPV6_RECVORIGDSTADDR)", "failed\n")) {
+                       log_err("failed to enable IPV6_RECVORIGDSTADDR");
+                       goto fail;
+               }
+       }
+
+       if (sotype == SOCK_STREAM) {
+               err = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one,
+                                sizeof(one));
+               if (CHECK(err, "setsockopt(SO_REUSEADDR)", "failed\n")) {
+                       log_err("failed to enable SO_REUSEADDR");
+                       goto fail;
+               }
+       }
+
+       if (reuseport_prog) {
+               err = setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one,
+                                sizeof(one));
+               if (CHECK(err, "setsockopt(SO_REUSEPORT)", "failed\n")) {
+                       log_err("failed to enable SO_REUSEPORT");
+                       goto fail;
+               }
+       }
+
+       err = bind(fd, (void *)&addr, inetaddr_len(&addr));
+       if (CHECK(err, "bind", "failed\n")) {
+               log_err("failed to bind listen socket");
+               goto fail;
+       }
+
+       if (sotype == SOCK_STREAM) {
+               err = listen(fd, SOMAXCONN);
+               if (CHECK(err, "make_server", "listen")) {
+                       log_err("failed to listen on port %d", port);
+                       goto fail;
+               }
+       }
+
+       /* Attach the reuseport prog late so there is a single init path */
+       if (reuseport_prog) {
+               err = attach_reuseport(fd, reuseport_prog);
+               if (CHECK(err, "attach_reuseport", "failed\n")) {
+                       log_err("failed to attach reuseport prog");
+                       goto fail;
+               }
+       }
+
+       return fd;
+fail:
+       close(fd);
+       return -1;
+}
+
+static int make_client(int sotype, const char *ip, int port)
+{
+       struct sockaddr_storage addr = {0};
+       int err, fd;
+
+       fd = make_socket(sotype, ip, port, &addr);
+       if (fd < 0)
+               return -1;
+
+       err = connect(fd, (void *)&addr, inetaddr_len(&addr));
+       if (CHECK(err, "make_client", "connect")) {
+               log_err("failed to connect client socket");
+               goto fail;
+       }
+
+       return fd;
+fail:
+       close(fd);
+       return -1;
+}
+
+static int send_byte(int fd)
+{
+       ssize_t n;
+
+       errno = 0;
+       n = send(fd, "a", 1, 0);
+       if (CHECK(n <= 0, "send_byte", "send")) {
+               log_err("failed/partial send");
+               return -1;
+       }
+       return 0;
+}
+
+static int recv_byte(int fd)
+{
+       char buf[1];
+       ssize_t n;
+
+       n = recv(fd, buf, sizeof(buf), 0);
+       if (CHECK(n <= 0, "recv_byte", "recv")) {
+               log_err("failed/partial recv");
+               return -1;
+       }
+       return 0;
+}
+
+static int tcp_recv_send(int server_fd)
+{
+       char buf[1];
+       int ret, fd;
+       ssize_t n;
+
+       fd = accept(server_fd, NULL, NULL);
+       if (CHECK(fd < 0, "accept", "failed\n")) {
+               log_err("failed to accept");
+               return -1;
+       }
+
+       n = recv(fd, buf, sizeof(buf), 0);
+       if (CHECK(n <= 0, "recv", "failed\n")) {
+               log_err("failed/partial recv");
+               ret = -1;
+               goto close;
+       }
+
+       n = send(fd, buf, n, 0);
+       if (CHECK(n <= 0, "send", "failed\n")) {
+               log_err("failed/partial send");
+               ret = -1;
+               goto close;
+       }
+
+       ret = 0;
+close:
+       close(fd);
+       return ret;
+}
+
+static void v4_to_v6(struct sockaddr_storage *ss)
+{
+       struct sockaddr_in6 *v6 = (struct sockaddr_in6 *)ss;
+       struct sockaddr_in v4 = *(struct sockaddr_in *)ss;
+
+       v6->sin6_family = AF_INET6;
+       v6->sin6_port = v4.sin_port;
+       v6->sin6_addr.s6_addr[10] = 0xff;
+       v6->sin6_addr.s6_addr[11] = 0xff;
+       memcpy(&v6->sin6_addr.s6_addr[12], &v4.sin_addr.s_addr, 4);
+}
+
+static int udp_recv_send(int server_fd)
+{
+       char cmsg_buf[CMSG_SPACE(sizeof(struct sockaddr_storage))];
+       struct sockaddr_storage _src_addr = { 0 };
+       struct sockaddr_storage *src_addr = &_src_addr;
+       struct sockaddr_storage *dst_addr = NULL;
+       struct msghdr msg = { 0 };
+       struct iovec iov = { 0 };
+       struct cmsghdr *cm;
+       char buf[1];
+       int ret, fd;
+       ssize_t n;
+
+       iov.iov_base = buf;
+       iov.iov_len = sizeof(buf);
+
+       msg.msg_name = src_addr;
+       msg.msg_namelen = sizeof(*src_addr);
+       msg.msg_iov = &iov;
+       msg.msg_iovlen = 1;
+       msg.msg_control = cmsg_buf;
+       msg.msg_controllen = sizeof(cmsg_buf);
+
+       errno = 0;
+       n = recvmsg(server_fd, &msg, 0);
+       if (CHECK(n <= 0, "recvmsg", "failed\n")) {
+               log_err("failed to receive");
+               return -1;
+       }
+       if (CHECK(msg.msg_flags & MSG_CTRUNC, "recvmsg", "truncated cmsg\n"))
+               return -1;
+
+       for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
+               if ((cm->cmsg_level == SOL_IP &&
+                    cm->cmsg_type == IP_ORIGDSTADDR) ||
+                   (cm->cmsg_level == SOL_IPV6 &&
+                    cm->cmsg_type == IPV6_ORIGDSTADDR)) {
+                       dst_addr = (struct sockaddr_storage *)CMSG_DATA(cm);
+                       break;
+               }
+               log_err("warning: ignored cmsg at level %d type %d",
+                       cm->cmsg_level, cm->cmsg_type);
+       }
+       if (CHECK(!dst_addr, "recvmsg", "missing ORIGDSTADDR\n"))
+               return -1;
+
+       /* Server socket bound to IPv4-mapped IPv6 address */
+       if (src_addr->ss_family == AF_INET6 &&
+           dst_addr->ss_family == AF_INET) {
+               v4_to_v6(dst_addr);
+       }
+
+       /* Reply from original destination address. */
+       fd = socket(dst_addr->ss_family, SOCK_DGRAM, 0);
+       if (CHECK(fd < 0, "socket", "failed\n")) {
+               log_err("failed to create tx socket");
+               return -1;
+       }
+
+       ret = bind(fd, (struct sockaddr *)dst_addr, sizeof(*dst_addr));
+       if (CHECK(ret, "bind", "failed\n")) {
+               log_err("failed to bind tx socket");
+               goto out;
+       }
+
+       msg.msg_control = NULL;
+       msg.msg_controllen = 0;
+       n = sendmsg(fd, &msg, 0);
+       if (CHECK(n <= 0, "sendmsg", "failed\n")) {
+               log_err("failed to send echo reply");
+               ret = -1;
+               goto out;
+       }
+
+       ret = 0;
+out:
+       close(fd);
+       return ret;
+}
+
+static int tcp_echo_test(int client_fd, int server_fd)
+{
+       int err;
+
+       err = send_byte(client_fd);
+       if (err)
+               return -1;
+       err = tcp_recv_send(server_fd);
+       if (err)
+               return -1;
+       err = recv_byte(client_fd);
+       if (err)
+               return -1;
+
+       return 0;
+}
+
+static int udp_echo_test(int client_fd, int server_fd)
+{
+       int err;
+
+       err = send_byte(client_fd);
+       if (err)
+               return -1;
+       err = udp_recv_send(server_fd);
+       if (err)
+               return -1;
+       err = recv_byte(client_fd);
+       if (err)
+               return -1;
+
+       return 0;
+}
+
+static struct bpf_link *attach_lookup_prog(struct bpf_program *prog)
+{
+       struct bpf_link *link;
+       int net_fd;
+
+       net_fd = open("/proc/self/ns/net", O_RDONLY);
+       if (CHECK(net_fd < 0, "open", "failed\n")) {
+               log_err("failed to open /proc/self/ns/net");
+               return NULL;
+       }
+
+       link = bpf_program__attach_netns(prog, net_fd);
+       if (CHECK(IS_ERR(link), "bpf_program__attach_netns", "failed\n")) {
+               errno = -PTR_ERR(link);
+               log_err("failed to attach program '%s' to netns",
+                       bpf_program__name(prog));
+               link = NULL;
+       }
+
+       close(net_fd);
+       return link;
+}
+
+static int update_lookup_map(struct bpf_map *map, int index, int sock_fd)
+{
+       int err, map_fd;
+       uint64_t value;
+
+       map_fd = bpf_map__fd(map);
+       if (CHECK(map_fd < 0, "bpf_map__fd", "failed\n")) {
+               errno = -map_fd;
+               log_err("failed to get map FD");
+               return -1;
+       }
+
+       value = (uint64_t)sock_fd;
+       err = bpf_map_update_elem(map_fd, &index, &value, BPF_NOEXIST);
+       if (CHECK(err, "bpf_map_update_elem", "failed\n")) {
+               log_err("failed to update redir_map @ %d", index);
+               return -1;
+       }
+
+       return 0;
+}
+
+static __u32 link_info_prog_id(struct bpf_link *link)
+{
+       struct bpf_link_info info = {};
+       __u32 info_len = sizeof(info);
+       int link_fd, err;
+
+       link_fd = bpf_link__fd(link);
+       if (CHECK(link_fd < 0, "bpf_link__fd", "failed\n")) {
+               errno = -link_fd;
+               log_err("bpf_link__fd failed");
+               return 0;
+       }
+
+       err = bpf_obj_get_info_by_fd(link_fd, &info, &info_len);
+       if (CHECK(err, "bpf_obj_get_info_by_fd", "failed\n")) {
+               log_err("bpf_obj_get_info_by_fd");
+               return 0;
+       }
+       if (CHECK(info_len != sizeof(info), "bpf_obj_get_info_by_fd",
+                 "unexpected info len %u\n", info_len))
+               return 0;
+
+       return info.prog_id;
+}
+
+static void query_lookup_prog(struct test_sk_lookup *skel)
+{
+       struct bpf_link *link[3] = {};
+       __u32 attach_flags = 0;
+       __u32 prog_ids[3] = {};
+       __u32 prog_cnt = 3;
+       __u32 prog_id;
+       int net_fd;
+       int err;
+
+       net_fd = open("/proc/self/ns/net", O_RDONLY);
+       if (CHECK(net_fd < 0, "open", "failed\n")) {
+               log_err("failed to open /proc/self/ns/net");
+               return;
+       }
+
+       link[0] = attach_lookup_prog(skel->progs.lookup_pass);
+       if (!link[0])
+               goto close;
+       link[1] = attach_lookup_prog(skel->progs.lookup_pass);
+       if (!link[1])
+               goto detach;
+       link[2] = attach_lookup_prog(skel->progs.lookup_drop);
+       if (!link[2])
+               goto detach;
+
+       err = bpf_prog_query(net_fd, BPF_SK_LOOKUP, 0 /* query flags */,
+                            &attach_flags, prog_ids, &prog_cnt);
+       if (CHECK(err, "bpf_prog_query", "failed\n")) {
+               log_err("failed to query lookup prog");
+               goto detach;
+       }
+
+       errno = 0;
+       if (CHECK(attach_flags != 0, "bpf_prog_query",
+                 "wrong attach_flags on query: %u", attach_flags))
+               goto detach;
+       if (CHECK(prog_cnt != 3, "bpf_prog_query",
+                 "wrong program count on query: %u", prog_cnt))
+               goto detach;
+       prog_id = link_info_prog_id(link[0]);
+       CHECK(prog_ids[0] != prog_id, "bpf_prog_query",
+             "invalid program #0 id on query: %u != %u\n",
+             prog_ids[0], prog_id);
+       prog_id = link_info_prog_id(link[1]);
+       CHECK(prog_ids[1] != prog_id, "bpf_prog_query",
+             "invalid program #1 id on query: %u != %u\n",
+             prog_ids[1], prog_id);
+       prog_id = link_info_prog_id(link[2]);
+       CHECK(prog_ids[2] != prog_id, "bpf_prog_query",
+             "invalid program #2 id on query: %u != %u\n",
+             prog_ids[2], prog_id);
+
+detach:
+       if (link[2])
+               bpf_link__destroy(link[2]);
+       if (link[1])
+               bpf_link__destroy(link[1]);
+       if (link[0])
+               bpf_link__destroy(link[0]);
+close:
+       close(net_fd);
+}
+
+static void run_lookup_prog(const struct test *t)
+{
+       int client_fd, server_fds[] = { [0 ... MAX_SERVERS - 1] = -1 };
+       struct bpf_link *lookup_link;
+       int i, err;
+
+       lookup_link = attach_lookup_prog(t->lookup_prog);
+       if (!lookup_link)
+               return;
+
+       for (i = 0; i < ARRAY_SIZE(server_fds); i++) {
+               server_fds[i] = make_server(t->sotype, t->listen_at.ip,
+                                           t->listen_at.port,
+                                           t->reuseport_prog);
+               if (server_fds[i] < 0)
+                       goto close;
+
+               err = update_lookup_map(t->sock_map, i, server_fds[i]);
+               if (err)
+                       goto close;
+
+               /* want just one server for non-reuseport test */
+               if (!t->reuseport_prog)
+                       break;
+       }
+
+       client_fd = make_client(t->sotype, t->connect_to.ip, t->connect_to.port);
+       if (client_fd < 0)
+               goto close;
+
+       if (t->sotype == SOCK_STREAM)
+               tcp_echo_test(client_fd, server_fds[t->accept_on]);
+       else
+               udp_echo_test(client_fd, server_fds[t->accept_on]);
+
+       close(client_fd);
+close:
+       for (i = 0; i < ARRAY_SIZE(server_fds); i++) {
+               if (server_fds[i] != -1)
+                       close(server_fds[i]);
+       }
+       bpf_link__destroy(lookup_link);
+}
+
+static void test_redirect_lookup(struct test_sk_lookup *skel)
+{
+       const struct test tests[] = {
+               {
+                       .desc           = "TCP IPv4 redir port",
+                       .lookup_prog    = skel->progs.redir_port,
+                       .sock_map       = skel->maps.redir_map,
+                       .sotype         = SOCK_STREAM,
+                       .connect_to     = { EXT_IP4, EXT_PORT },
+                       .listen_at      = { EXT_IP4, INT_PORT },
+               },
+               {
+                       .desc           = "TCP IPv4 redir addr",
+                       .lookup_prog    = skel->progs.redir_ip4,
+                       .sock_map       = skel->maps.redir_map,
+                       .sotype         = SOCK_STREAM,
+                       .connect_to     = { EXT_IP4, EXT_PORT },
+                       .listen_at      = { INT_IP4, EXT_PORT },
+               },
+               {
+                       .desc           = "TCP IPv4 redir with reuseport",
+                       .lookup_prog    = skel->progs.select_sock_a,
+                       .reuseport_prog = skel->progs.select_sock_b,
+                       .sock_map       = skel->maps.redir_map,
+                       .sotype         = SOCK_STREAM,
+                       .connect_to     = { EXT_IP4, EXT_PORT },
+                       .listen_at      = { INT_IP4, INT_PORT },
+                       .accept_on      = SERVER_B,
+               },
+               {
+                       .desc           = "TCP IPv4 redir skip reuseport",
+                       .lookup_prog    = skel->progs.select_sock_a_no_reuseport,
+                       .reuseport_prog = skel->progs.select_sock_b,
+                       .sock_map       = skel->maps.redir_map,
+                       .sotype         = SOCK_STREAM,
+                       .connect_to     = { EXT_IP4, EXT_PORT },
+                       .listen_at      = { INT_IP4, INT_PORT },
+                       .accept_on      = SERVER_A,
+               },
+               {
+                       .desc           = "TCP IPv6 redir port",
+                       .lookup_prog    = skel->progs.redir_port,
+                       .sock_map       = skel->maps.redir_map,
+                       .sotype         = SOCK_STREAM,
+                       .connect_to     = { EXT_IP6, EXT_PORT },
+                       .listen_at      = { EXT_IP6, INT_PORT },
+               },
+               {
+                       .desc           = "TCP IPv6 redir addr",
+                       .lookup_prog    = skel->progs.redir_ip6,
+                       .sock_map       = skel->maps.redir_map,
+                       .sotype         = SOCK_STREAM,
+                       .connect_to     = { EXT_IP6, EXT_PORT },
+                       .listen_at      = { INT_IP6, EXT_PORT },
+               },
+               {
+                       .desc           = "TCP IPv4->IPv6 redir port",
+                       .lookup_prog    = skel->progs.redir_port,
+                       .sock_map       = skel->maps.redir_map,
+                       .sotype         = SOCK_STREAM,
+                       .connect_to     = { EXT_IP4, EXT_PORT },
+                       .listen_at      = { INT_IP4_V6, INT_PORT },
+               },
+               {
+                       .desc           = "TCP IPv6 redir with reuseport",
+                       .lookup_prog    = skel->progs.select_sock_a,
+                       .reuseport_prog = skel->progs.select_sock_b,
+                       .sock_map       = skel->maps.redir_map,
+                       .sotype         = SOCK_STREAM,
+                       .connect_to     = { EXT_IP6, EXT_PORT },
+                       .listen_at      = { INT_IP6, INT_PORT },
+                       .accept_on      = SERVER_B,
+               },
+               {
+                       .desc           = "TCP IPv6 redir skip reuseport",
+                       .lookup_prog    = skel->progs.select_sock_a_no_reuseport,
+                       .reuseport_prog = skel->progs.select_sock_b,
+                       .sock_map       = skel->maps.redir_map,
+                       .sotype         = SOCK_STREAM,
+                       .connect_to     = { EXT_IP6, EXT_PORT },
+                       .listen_at      = { INT_IP6, INT_PORT },
+                       .accept_on      = SERVER_A,
+               },
+               {
+                       .desc           = "UDP IPv4 redir port",
+                       .lookup_prog    = skel->progs.redir_port,
+                       .sock_map       = skel->maps.redir_map,
+                       .sotype         = SOCK_DGRAM,
+                       .connect_to     = { EXT_IP4, EXT_PORT },
+                       .listen_at      = { EXT_IP4, INT_PORT },
+               },
+               {
+                       .desc           = "UDP IPv4 redir addr",
+                       .lookup_prog    = skel->progs.redir_ip4,
+                       .sock_map       = skel->maps.redir_map,
+                       .sotype         = SOCK_DGRAM,
+                       .connect_to     = { EXT_IP4, EXT_PORT },
+                       .listen_at      = { INT_IP4, EXT_PORT },
+               },
+               {
+                       .desc           = "UDP IPv4 redir with reuseport",
+                       .lookup_prog    = skel->progs.select_sock_a,
+                       .reuseport_prog = skel->progs.select_sock_b,
+                       .sock_map       = skel->maps.redir_map,
+                       .sotype         = SOCK_DGRAM,
+                       .connect_to     = { EXT_IP4, EXT_PORT },
+                       .listen_at      = { INT_IP4, INT_PORT },
+                       .accept_on      = SERVER_B,
+               },
+               {
+                       .desc           = "UDP IPv4 redir skip reuseport",
+                       .lookup_prog    = skel->progs.select_sock_a_no_reuseport,
+                       .reuseport_prog = skel->progs.select_sock_b,
+                       .sock_map       = skel->maps.redir_map,
+                       .sotype         = SOCK_DGRAM,
+                       .connect_to     = { EXT_IP4, EXT_PORT },
+                       .listen_at      = { INT_IP4, INT_PORT },
+                       .accept_on      = SERVER_A,
+               },
+               {
+                       .desc           = "UDP IPv6 redir port",
+                       .lookup_prog    = skel->progs.redir_port,
+                       .sock_map       = skel->maps.redir_map,
+                       .sotype         = SOCK_DGRAM,
+                       .connect_to     = { EXT_IP6, EXT_PORT },
+                       .listen_at      = { EXT_IP6, INT_PORT },
+               },
+               {
+                       .desc           = "UDP IPv6 redir addr",
+                       .lookup_prog    = skel->progs.redir_ip6,
+                       .sock_map       = skel->maps.redir_map,
+                       .sotype         = SOCK_DGRAM,
+                       .connect_to     = { EXT_IP6, EXT_PORT },
+                       .listen_at      = { INT_IP6, EXT_PORT },
+               },
+               {
+                       .desc           = "UDP IPv4->IPv6 redir port",
+                       .lookup_prog    = skel->progs.redir_port,
+                       .sock_map       = skel->maps.redir_map,
+                       .sotype         = SOCK_DGRAM,
+                       .listen_at      = { INT_IP4_V6, INT_PORT },
+                       .connect_to     = { EXT_IP4, EXT_PORT },
+               },
+               {
+                       .desc           = "UDP IPv6 redir and reuseport",
+                       .lookup_prog    = skel->progs.select_sock_a,
+                       .reuseport_prog = skel->progs.select_sock_b,
+                       .sock_map       = skel->maps.redir_map,
+                       .sotype         = SOCK_DGRAM,
+                       .connect_to     = { EXT_IP6, EXT_PORT },
+                       .listen_at      = { INT_IP6, INT_PORT },
+                       .accept_on      = SERVER_B,
+               },
+               {
+                       .desc           = "UDP IPv6 redir skip reuseport",
+                       .lookup_prog    = skel->progs.select_sock_a_no_reuseport,
+                       .reuseport_prog = skel->progs.select_sock_b,
+                       .sock_map       = skel->maps.redir_map,
+                       .sotype         = SOCK_DGRAM,
+                       .connect_to     = { EXT_IP6, EXT_PORT },
+                       .listen_at      = { INT_IP6, INT_PORT },
+                       .accept_on      = SERVER_A,
+               },
+       };
+       const struct test *t;
+
+       for (t = tests; t < tests + ARRAY_SIZE(tests); t++) {
+               if (test__start_subtest(t->desc))
+                       run_lookup_prog(t);
+       }
+}
+
+static void drop_on_lookup(const struct test *t)
+{
+       struct sockaddr_storage dst = {};
+       int client_fd, server_fd, err;
+       struct bpf_link *lookup_link;
+       ssize_t n;
+
+       lookup_link = attach_lookup_prog(t->lookup_prog);
+       if (!lookup_link)
+               return;
+
+       server_fd = make_server(t->sotype, t->listen_at.ip, t->listen_at.port,
+                               t->reuseport_prog);
+       if (server_fd < 0)
+               goto detach;
+
+       client_fd = make_socket(t->sotype, t->connect_to.ip,
+                               t->connect_to.port, &dst);
+       if (client_fd < 0)
+               goto close_srv;
+
+       err = connect(client_fd, (void *)&dst, inetaddr_len(&dst));
+       if (t->sotype == SOCK_DGRAM) {
+               err = send_byte(client_fd);
+               if (err)
+                       goto close_all;
+
+               /* Read out asynchronous error */
+               n = recv(client_fd, NULL, 0, 0);
+               err = n == -1;
+       }
+       if (CHECK(!err || errno != ECONNREFUSED, "connect",
+                 "unexpected success or error\n"))
+               log_err("expected ECONNREFUSED on connect");
+
+close_all:
+       close(client_fd);
+close_srv:
+       close(server_fd);
+detach:
+       bpf_link__destroy(lookup_link);
+}
+
+static void test_drop_on_lookup(struct test_sk_lookup *skel)
+{
+       const struct test tests[] = {
+               {
+                       .desc           = "TCP IPv4 drop on lookup",
+                       .lookup_prog    = skel->progs.lookup_drop,
+                       .sotype         = SOCK_STREAM,
+                       .connect_to     = { EXT_IP4, EXT_PORT },
+                       .listen_at      = { EXT_IP4, EXT_PORT },
+               },
+               {
+                       .desc           = "TCP IPv6 drop on lookup",
+                       .lookup_prog    = skel->progs.lookup_drop,
+                       .sotype         = SOCK_STREAM,
+                       .connect_to     = { EXT_IP6, EXT_PORT },
+                       .listen_at      = { EXT_IP6, EXT_PORT },
+               },
+               {
+                       .desc           = "UDP IPv4 drop on lookup",
+                       .lookup_prog    = skel->progs.lookup_drop,
+                       .sotype         = SOCK_DGRAM,
+                       .connect_to     = { EXT_IP4, EXT_PORT },
+                       .listen_at      = { EXT_IP4, EXT_PORT },
+               },
+               {
+                       .desc           = "UDP IPv6 drop on lookup",
+                       .lookup_prog    = skel->progs.lookup_drop,
+                       .sotype         = SOCK_DGRAM,
+                       .connect_to     = { EXT_IP6, EXT_PORT },
+                       .listen_at      = { EXT_IP6, EXT_PORT },
+               },
+       };
+       const struct test *t;
+
+       for (t = tests; t < tests + ARRAY_SIZE(tests); t++) {
+               if (test__start_subtest(t->desc))
+                       drop_on_lookup(t);
+       }
+}
+
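+/* Steer the connection to server 1 with the lookup program and let
+ * server 1's reuseport program drop it. Server 2 listens directly on
+ * the destination address, so a refused connection shows that the
+ * reuseport verdict was applied after the lookup redirect.
+ */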
+static void drop_on_reuseport(const struct test *t)
+{
+       struct sockaddr_storage dst = { 0 };
+       int client, server1, server2, err;
+       struct bpf_link *lookup_link;
+       ssize_t n;
+
+       lookup_link = attach_lookup_prog(t->lookup_prog);
+       if (!lookup_link)
+               return;
+
+       server1 = make_server(t->sotype, t->listen_at.ip, t->listen_at.port,
+                             t->reuseport_prog);
+       if (server1 < 0)
+               goto detach;
+
+       err = update_lookup_map(t->sock_map, SERVER_A, server1);
+       if (err)
+               goto close_srv1;
+
+       /* Second server on the destination address; it must never be reached */
+       server2 = make_server(t->sotype, t->connect_to.ip, t->connect_to.port,
+                             NULL /* reuseport prog */);
+       if (server2 < 0)
+               goto close_srv1;
+
+       client = make_socket(t->sotype, t->connect_to.ip,
+                            t->connect_to.port, &dst);
+       if (client < 0)
+               goto close_srv2;
+
+       err = connect(client, (void *)&dst, inetaddr_len(&dst));
+       if (t->sotype == SOCK_DGRAM) {
+               err = send_byte(client);
+               if (err)
+                       goto close_all;
+
+               /* Read out asynchronous error */
+               n = recv(client, NULL, 0, 0);
+               err = n == -1;
+       }
+       if (CHECK(!err || errno != ECONNREFUSED, "connect",
+                 "unexpected success or error\n"))
+               log_err("expected ECONNREFUSED on connect");
+
+close_all:
+       close(client);
+close_srv2:
+       close(server2);
+close_srv1:
+       close(server1);
+detach:
+       bpf_link__destroy(lookup_link);
+}
+
+static void test_drop_on_reuseport(struct test_sk_lookup *skel)
+{
+       const struct test tests[] = {
+               {
+                       .desc           = "TCP IPv4 drop on reuseport",
+                       .lookup_prog    = skel->progs.select_sock_a,
+                       .reuseport_prog = skel->progs.reuseport_drop,
+                       .sock_map       = skel->maps.redir_map,
+                       .sotype         = SOCK_STREAM,
+                       .connect_to     = { EXT_IP4, EXT_PORT },
+                       .listen_at      = { INT_IP4, INT_PORT },
+               },
+               {
+                       .desc           = "TCP IPv6 drop on reuseport",
+                       .lookup_prog    = skel->progs.select_sock_a,
+                       .reuseport_prog = skel->progs.reuseport_drop,
+                       .sock_map       = skel->maps.redir_map,
+                       .sotype         = SOCK_STREAM,
+                       .connect_to     = { EXT_IP6, EXT_PORT },
+                       .listen_at      = { INT_IP6, INT_PORT },
+               },
+               {
+                       .desc           = "UDP IPv4 drop on reuseport",
+                       .lookup_prog    = skel->progs.select_sock_a,
+                       .reuseport_prog = skel->progs.reuseport_drop,
+                       .sock_map       = skel->maps.redir_map,
+                       .sotype         = SOCK_DGRAM,
+                       .connect_to     = { EXT_IP4, EXT_PORT },
+                       .listen_at      = { INT_IP4, INT_PORT },
+               },
+               {
+                       .desc           = "TCP IPv6 drop on reuseport",
+                       .lookup_prog    = skel->progs.select_sock_a,
+                       .reuseport_prog = skel->progs.reuseport_drop,
+                       .sock_map       = skel->maps.redir_map,
+                       .sotype         = SOCK_DGRAM,
+                       .connect_to     = { EXT_IP6, EXT_PORT },
+                       .listen_at      = { INT_IP6, INT_PORT },
+               },
+       };
+       const struct test *t;
+
+       for (t = tests; t < tests + ARRAY_SIZE(tests); t++) {
+               if (test__start_subtest(t->desc))
+                       drop_on_reuseport(t);
+       }
+}
+
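+/* Start MAX_SERVERS listeners on an address the client never connects
+ * to and let the lookup program under test assign one of them. The
+ * connection is expected to land on SERVER_B.
+ */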
+static void run_sk_assign(struct test_sk_lookup *skel,
+                         struct bpf_program *lookup_prog,
+                         const char *listen_ip, const char *connect_ip)
+{
+       int client_fd, peer_fd, server_fds[] = { [0 ... MAX_SERVERS - 1] = -1 };
+       struct bpf_link *lookup_link;
+       int i, err;
+
+       lookup_link = attach_lookup_prog(lookup_prog);
+       if (!lookup_link)
+               return;
+
+       for (i = 0; i < ARRAY_SIZE(server_fds); i++) {
+               server_fds[i] = make_server(SOCK_STREAM, listen_ip, 0, NULL);
+               if (server_fds[i] < 0)
+                       goto close_servers;
+
+               err = update_lookup_map(skel->maps.redir_map, i,
+                                       server_fds[i]);
+               if (err)
+                       goto close_servers;
+       }
+
+       client_fd = make_client(SOCK_STREAM, connect_ip, EXT_PORT);
+       if (client_fd < 0)
+               goto close_servers;
+
+       peer_fd = accept(server_fds[SERVER_B], NULL, NULL);
+       if (CHECK(peer_fd < 0, "accept", "failed\n"))
+               goto close_client;
+
+       close(peer_fd);
+close_client:
+       close(client_fd);
+close_servers:
+       for (i = 0; i < ARRAY_SIZE(server_fds); i++) {
+               if (server_fds[i] != -1)
+                       close(server_fds[i]);
+       }
+       bpf_link__destroy(lookup_link);
+}
+
+static void run_sk_assign_v4(struct test_sk_lookup *skel,
+                            struct bpf_program *lookup_prog)
+{
+       run_sk_assign(skel, lookup_prog, INT_IP4, EXT_IP4);
+}
+
+static void run_sk_assign_v6(struct test_sk_lookup *skel,
+                            struct bpf_program *lookup_prog)
+{
+       run_sk_assign(skel, lookup_prog, INT_IP6, EXT_IP6);
+}
+
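+/* bpf_sk_assign() must refuse an established (connected) socket. The
+ * sk_assign_esocknosupport program checks for -ESOCKTNOSUPPORT and
+ * passes the packet on to the regular lookup.
+ */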
+static void run_sk_assign_connected(struct test_sk_lookup *skel,
+                                   int sotype)
+{
+       int err, client_fd, connected_fd, server_fd;
+       struct bpf_link *lookup_link;
+
+       server_fd = make_server(sotype, EXT_IP4, EXT_PORT, NULL);
+       if (server_fd < 0)
+               return;
+
+       connected_fd = make_client(sotype, EXT_IP4, EXT_PORT);
+       if (connected_fd < 0)
+               goto out_close_server;
+
+       /* Put a connected socket in redirect map */
+       err = update_lookup_map(skel->maps.redir_map, SERVER_A, connected_fd);
+       if (err)
+               goto out_close_connected;
+
+       lookup_link = attach_lookup_prog(skel->progs.sk_assign_esocknosupport);
+       if (!lookup_link)
+               goto out_close_connected;
+
+       /* Try to redirect TCP SYN / UDP packet to a connected socket */
+       client_fd = make_client(sotype, EXT_IP4, EXT_PORT);
+       if (client_fd < 0)
+               goto out_unlink_prog;
+       if (sotype == SOCK_DGRAM) {
+               send_byte(client_fd);
+               recv_byte(server_fd);
+       }
+
+       close(client_fd);
+out_unlink_prog:
+       bpf_link__destroy(lookup_link);
+out_close_connected:
+       close(connected_fd);
+out_close_server:
+       close(server_fd);
+}
+
+static void test_sk_assign_helper(struct test_sk_lookup *skel)
+{
+       if (test__start_subtest("sk_assign returns EEXIST"))
+               run_sk_assign_v4(skel, skel->progs.sk_assign_eexist);
+       if (test__start_subtest("sk_assign honors F_REPLACE"))
+               run_sk_assign_v4(skel, skel->progs.sk_assign_replace_flag);
+       if (test__start_subtest("sk_assign accepts NULL socket"))
+               run_sk_assign_v4(skel, skel->progs.sk_assign_null);
+       if (test__start_subtest("access ctx->sk"))
+               run_sk_assign_v4(skel, skel->progs.access_ctx_sk);
+       if (test__start_subtest("narrow access to ctx v4"))
+               run_sk_assign_v4(skel, skel->progs.ctx_narrow_access);
+       if (test__start_subtest("narrow access to ctx v6"))
+               run_sk_assign_v6(skel, skel->progs.ctx_narrow_access);
+       if (test__start_subtest("sk_assign rejects TCP established"))
+               run_sk_assign_connected(skel, SOCK_STREAM);
+       if (test__start_subtest("sk_assign rejects UDP connected"))
+               run_sk_assign_connected(skel, SOCK_DGRAM);
+}
+
+struct test_multi_prog {
+       const char *desc;
+       struct bpf_program *prog1;
+       struct bpf_program *prog2;
+       struct bpf_map *redir_map;
+       struct bpf_map *run_map;
+       int expect_errno;
+       struct inet_addr listen_at;
+};
+
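+/* Attach two lookup programs at once and verify from the PROG_DONE
+ * markers in run_map that both of them ran.
+ */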
+static void run_multi_prog_lookup(const struct test_multi_prog *t)
+{
+       struct sockaddr_storage dst = {};
+       int map_fd, server_fd, client_fd;
+       struct bpf_link *link1, *link2;
+       int prog_idx, done, err;
+
+       map_fd = bpf_map__fd(t->run_map);
+
+       done = 0;
+       prog_idx = PROG1;
+       err = bpf_map_update_elem(map_fd, &prog_idx, &done, BPF_ANY);
+       if (CHECK(err, "bpf_map_update_elem", "failed\n"))
+               return;
+       prog_idx = PROG2;
+       err = bpf_map_update_elem(map_fd, &prog_idx, &done, BPF_ANY);
+       if (CHECK(err, "bpf_map_update_elem", "failed\n"))
+               return;
+
+       link1 = attach_lookup_prog(t->prog1);
+       if (!link1)
+               return;
+       link2 = attach_lookup_prog(t->prog2);
+       if (!link2)
+               goto out_unlink1;
+
+       server_fd = make_server(SOCK_STREAM, t->listen_at.ip,
+                               t->listen_at.port, NULL);
+       if (server_fd < 0)
+               goto out_unlink2;
+
+       err = update_lookup_map(t->redir_map, SERVER_A, server_fd);
+       if (err)
+               goto out_close_server;
+
+       client_fd = make_socket(SOCK_STREAM, EXT_IP4, EXT_PORT, &dst);
+       if (client_fd < 0)
+               goto out_close_server;
+
+       err = connect(client_fd, (void *)&dst, inetaddr_len(&dst));
+       if (CHECK(err && !t->expect_errno, "connect",
+                 "unexpected error %d\n", errno))
+               goto out_close_client;
+       if (CHECK(err && t->expect_errno && errno != t->expect_errno,
+                 "connect", "unexpected error %d\n", errno))
+               goto out_close_client;
+
+       done = 0;
+       prog_idx = PROG1;
+       err = bpf_map_lookup_elem(map_fd, &prog_idx, &done);
+       CHECK(err, "bpf_map_lookup_elem", "failed\n");
+       CHECK(!done, "bpf_map_lookup_elem", "PROG1 !done\n");
+
+       done = 0;
+       prog_idx = PROG2;
+       err = bpf_map_lookup_elem(map_fd, &prog_idx, &done);
+       CHECK(err, "bpf_map_lookup_elem", "failed\n");
+       CHECK(!done, "bpf_map_lookup_elem", "PROG2 !done\n");
+
+out_close_client:
+       close(client_fd);
+out_close_server:
+       close(server_fd);
+out_unlink2:
+       bpf_link__destroy(link2);
+out_unlink1:
+       bpf_link__destroy(link1);
+}
+
+static void test_multi_prog_lookup(struct test_sk_lookup *skel)
+{
+       struct test_multi_prog tests[] = {
+               {
+                       .desc           = "multi prog - pass, pass",
+                       .prog1          = skel->progs.multi_prog_pass1,
+                       .prog2          = skel->progs.multi_prog_pass2,
+                       .listen_at      = { EXT_IP4, EXT_PORT },
+               },
+               {
+                       .desc           = "multi prog - drop, drop",
+                       .prog1          = skel->progs.multi_prog_drop1,
+                       .prog2          = skel->progs.multi_prog_drop2,
+                       .listen_at      = { EXT_IP4, EXT_PORT },
+                       .expect_errno   = ECONNREFUSED,
+               },
+               {
+                       .desc           = "multi prog - pass, drop",
+                       .prog1          = skel->progs.multi_prog_pass1,
+                       .prog2          = skel->progs.multi_prog_drop2,
+                       .listen_at      = { EXT_IP4, EXT_PORT },
+                       .expect_errno   = ECONNREFUSED,
+               },
+               {
+                       .desc           = "multi prog - drop, pass",
+                       .prog1          = skel->progs.multi_prog_drop1,
+                       .prog2          = skel->progs.multi_prog_pass2,
+                       .listen_at      = { EXT_IP4, EXT_PORT },
+                       .expect_errno   = ECONNREFUSED,
+               },
+               {
+                       .desc           = "multi prog - pass, redir",
+                       .prog1          = skel->progs.multi_prog_pass1,
+                       .prog2          = skel->progs.multi_prog_redir2,
+                       .listen_at      = { INT_IP4, INT_PORT },
+               },
+               {
+                       .desc           = "multi prog - redir, pass",
+                       .prog1          = skel->progs.multi_prog_redir1,
+                       .prog2          = skel->progs.multi_prog_pass2,
+                       .listen_at      = { INT_IP4, INT_PORT },
+               },
+               {
+                       .desc           = "multi prog - drop, redir",
+                       .prog1          = skel->progs.multi_prog_drop1,
+                       .prog2          = skel->progs.multi_prog_redir2,
+                       .listen_at      = { INT_IP4, INT_PORT },
+               },
+               {
+                       .desc           = "multi prog - redir, drop",
+                       .prog1          = skel->progs.multi_prog_redir1,
+                       .prog2          = skel->progs.multi_prog_drop2,
+                       .listen_at      = { INT_IP4, INT_PORT },
+               },
+               {
+                       .desc           = "multi prog - redir, redir",
+                       .prog1          = skel->progs.multi_prog_redir1,
+                       .prog2          = skel->progs.multi_prog_redir2,
+                       .listen_at      = { INT_IP4, INT_PORT },
+               },
+       };
+       struct test_multi_prog *t;
+
+       for (t = tests; t < tests + ARRAY_SIZE(tests); t++) {
+               t->redir_map = skel->maps.redir_map;
+               t->run_map = skel->maps.run_map;
+               if (test__start_subtest(t->desc))
+                       run_multi_prog_lookup(t);
+       }
+}
+
+static void run_tests(struct test_sk_lookup *skel)
+{
+       if (test__start_subtest("query lookup prog"))
+               query_lookup_prog(skel);
+       test_redirect_lookup(skel);
+       test_drop_on_lookup(skel);
+       test_drop_on_reuseport(skel);
+       test_sk_assign_helper(skel);
+       test_multi_prog_lookup(skel);
+}
+
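+/* Run everything in a throwaway network namespace with the external
+ * and internal IPv6 test addresses assigned to loopback.
+ */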
+static int switch_netns(void)
+{
+       static const char * const setup_script[] = {
+               "ip -6 addr add dev lo " EXT_IP6 "/128 nodad",
+               "ip -6 addr add dev lo " INT_IP6 "/128 nodad",
+               "ip link set dev lo up",
+               NULL,
+       };
+       const char * const *cmd;
+       int err;
+
+       err = unshare(CLONE_NEWNET);
+       if (CHECK(err, "unshare", "failed\n")) {
+               log_err("unshare(CLONE_NEWNET)");
+               return -1;
+       }
+
+       for (cmd = setup_script; *cmd; cmd++) {
+               err = system(*cmd);
+               if (CHECK(err, "system", "failed\n")) {
+                       log_err("system(%s)", *cmd);
+                       return -1;
+               }
+       }
+
+       return 0;
+}
+
+void test_sk_lookup(void)
+{
+       struct test_sk_lookup *skel;
+       int err;
+
+       err = switch_netns();
+       if (err)
+               return;
+
+       skel = test_sk_lookup__open_and_load();
+       if (CHECK(!skel, "skel open_and_load", "failed\n"))
+               return;
+
+       run_tests(skel);
+
+       test_sk_lookup__destroy(skel);
+}
index 8547ecb..ec281b0 100644 (file)
@@ -193,11 +193,10 @@ static void run_test(int cgroup_fd)
        if (CHECK_FAIL(server_fd < 0))
                goto close_bpf_object;
 
+       pthread_mutex_lock(&server_started_mtx);
        if (CHECK_FAIL(pthread_create(&tid, NULL, server_thread,
                                      (void *)&server_fd)))
                goto close_server_fd;
-
-       pthread_mutex_lock(&server_started_mtx);
        pthread_cond_wait(&server_started, &server_started_mtx);
        pthread_mutex_unlock(&server_started_mtx);
 
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c
new file mode 100644 (file)
index 0000000..0176573
--- /dev/null
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <uapi/linux/bpf.h>
+#include <linux/if_link.h>
+#include <test_progs.h>
+
+#include "test_xdp_with_cpumap_helpers.skel.h"
+
+#define IFINDEX_LO     1
+
+void test_xdp_with_cpumap_helpers(void)
+{
+       struct test_xdp_with_cpumap_helpers *skel;
+       struct bpf_prog_info info = {};
+       struct bpf_cpumap_val val = {
+               .qsize = 192,
+       };
+       __u32 duration = 0, idx = 0;
+       __u32 len = sizeof(info);
+       int err, prog_fd, map_fd;
+
+       skel = test_xdp_with_cpumap_helpers__open_and_load();
+       if (CHECK_FAIL(!skel)) {
+               perror("test_xdp_with_cpumap_helpers__open_and_load");
+               return;
+       }
+
+       /* Programs that redirect into a cpumap whose entries carry
+        * programs can not be attached in generic XDP (SKB) mode
+        */
+       prog_fd = bpf_program__fd(skel->progs.xdp_redir_prog);
+       err = bpf_set_link_xdp_fd(IFINDEX_LO, prog_fd, XDP_FLAGS_SKB_MODE);
+       CHECK(err == 0, "Generic attach of program with 8-byte CPUMAP",
+             "should have failed\n");
+
+       prog_fd = bpf_program__fd(skel->progs.xdp_dummy_cm);
+       map_fd = bpf_map__fd(skel->maps.cpu_map);
+       err = bpf_obj_get_info_by_fd(prog_fd, &info, &len);
+       if (CHECK_FAIL(err))
+               goto out_close;
+
+       val.bpf_prog.fd = prog_fd;
+       err = bpf_map_update_elem(map_fd, &idx, &val, 0);
+       CHECK(err, "Add program to cpumap entry", "err %d errno %d\n",
+             err, errno);
+
+       err = bpf_map_lookup_elem(map_fd, &idx, &val);
+       CHECK(err, "Read cpumap entry", "err %d errno %d\n", err, errno);
+       CHECK(info.id != val.bpf_prog.id, "Expected program id in cpumap entry",
+             "expected %u read %u\n", info.id, val.bpf_prog.id);
+
+       /* can not attach BPF_XDP_CPUMAP program to a device */
+       err = bpf_set_link_xdp_fd(IFINDEX_LO, prog_fd, XDP_FLAGS_SKB_MODE);
+       CHECK(err == 0, "Attach of BPF_XDP_CPUMAP program",
+             "should have failed\n");
+
+       val.qsize = 192;
+       val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_dummy_prog);
+       err = bpf_map_update_elem(map_fd, &idx, &val, 0);
+       CHECK(err == 0, "Add non-BPF_XDP_CPUMAP program to cpumap entry",
+             "should have failed\n");
+
+out_close:
+       test_xdp_with_cpumap_helpers__destroy(skel);
+}
+
+void test_xdp_cpumap_attach(void)
+{
+       if (test__start_subtest("cpumap_with_progs"))
+               test_xdp_with_cpumap_helpers();
+}
index 7de98a6..95989f4 100644 (file)
@@ -36,10 +36,10 @@ int dump_netlink(struct bpf_iter__netlink *ctx)
        if (!nlk->groups)  {
                group = 0;
        } else {
-               /* FIXME: temporary use bpf_probe_read here, needs
+               /* FIXME: temporary use bpf_probe_read_kernel here, needs
                 * verifier support to do direct access.
                 */
-               bpf_probe_read(&group, sizeof(group), &nlk->groups[0]);
+               bpf_probe_read_kernel(&group, sizeof(group), &nlk->groups[0]);
        }
        BPF_SEQ_PRINTF(seq, "%-10u %08x %-8d %-8d %-5d %-8d ",
                       nlk->portid, (u32)group,
@@ -56,7 +56,7 @@ int dump_netlink(struct bpf_iter__netlink *ctx)
                 * with current verifier.
                 */
                inode = SOCK_INODE(sk);
-               bpf_probe_read(&ino, sizeof(ino), &inode->i_ino);
+               bpf_probe_read_kernel(&ino, sizeof(ino), &inode->i_ino);
        }
        BPF_SEQ_PRINTF(seq, "%-8u %-8lu\n", s->sk_drops.counter, ino);
 
index 30fd587..54380c5 100644 (file)
@@ -57,7 +57,7 @@ static long sock_i_ino(const struct sock *sk)
                return 0;
 
        inode = &container_of(sk_socket, struct socket_alloc, socket)->vfs_inode;
-       bpf_probe_read(&ino, sizeof(ino), &inode->i_ino);
+       bpf_probe_read_kernel(&ino, sizeof(ino), &inode->i_ino);
        return ino;
 }
 
index 10dec43..b4fbddf 100644 (file)
@@ -57,7 +57,7 @@ static long sock_i_ino(const struct sock *sk)
                return 0;
 
        inode = &container_of(sk_socket, struct socket_alloc, socket)->vfs_inode;
-       bpf_probe_read(&ino, sizeof(ino), &inode->i_ino);
+       bpf_probe_read_kernel(&ino, sizeof(ino), &inode->i_ino);
        return ino;
 }
 
index 7053784..f258583 100644 (file)
@@ -18,7 +18,7 @@ static long sock_i_ino(const struct sock *sk)
                return 0;
 
        inode = &container_of(sk_socket, struct socket_alloc, socket)->vfs_inode;
-       bpf_probe_read(&ino, sizeof(ino), &inode->i_ino);
+       bpf_probe_read_kernel(&ino, sizeof(ino), &inode->i_ino);
        return ino;
 }
 
index c1175a6..65f93bb 100644 (file)
@@ -25,7 +25,7 @@ static long sock_i_ino(const struct sock *sk)
                return 0;
 
        inode = &container_of(sk_socket, struct socket_alloc, socket)->vfs_inode;
-       bpf_probe_read(&ino, sizeof(ino), &inode->i_ino);
+       bpf_probe_read_kernel(&ino, sizeof(ino), &inode->i_ino);
        return ino;
 }
 
diff --git a/tools/testing/selftests/bpf/progs/test_sk_lookup.c b/tools/testing/selftests/bpf/progs/test_sk_lookup.c
new file mode 100644 (file)
index 0000000..bbf8296
--- /dev/null
@@ -0,0 +1,641 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+// Copyright (c) 2020 Cloudflare
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <linux/bpf.h>
+#include <linux/in.h>
+#include <sys/socket.h>
+
+#include <bpf/bpf_endian.h>
+#include <bpf/bpf_helpers.h>
+
+#define IP4(a, b, c, d)                                        \
+       bpf_htonl((((__u32)(a) & 0xffU) << 24) |        \
+                 (((__u32)(b) & 0xffU) << 16) |        \
+                 (((__u32)(c) & 0xffU) <<  8) |        \
+                 (((__u32)(d) & 0xffU) <<  0))
+#define IP6(aaaa, bbbb, cccc, dddd)                    \
+       { bpf_htonl(aaaa), bpf_htonl(bbbb), bpf_htonl(cccc), bpf_htonl(dddd) }
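+/* For example, IP4(127, 0, 0, 1) expands to bpf_htonl(0x7f000001),
+ * the loopback address in network byte order.
+ */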
+
+#define MAX_SOCKS 32
+
+struct {
+       __uint(type, BPF_MAP_TYPE_SOCKMAP);
+       __uint(max_entries, MAX_SOCKS);
+       __type(key, __u32);
+       __type(value, __u64);
+} redir_map SEC(".maps");
+
+struct {
+       __uint(type, BPF_MAP_TYPE_ARRAY);
+       __uint(max_entries, 2);
+       __type(key, int);
+       __type(value, int);
+} run_map SEC(".maps");
+
+enum {
+       PROG1 = 0,
+       PROG2,
+};
+
+enum {
+       SERVER_A = 0,
+       SERVER_B,
+};
+
+/* Addressable key/value constants for convenience */
+static const int KEY_PROG1 = PROG1;
+static const int KEY_PROG2 = PROG2;
+static const int PROG_DONE = 1;
+
+static const __u32 KEY_SERVER_A = SERVER_A;
+static const __u32 KEY_SERVER_B = SERVER_B;
+
+static const __u16 DST_PORT = 7007; /* Host byte order */
+static const __u32 DST_IP4 = IP4(127, 0, 0, 1);
+static const __u32 DST_IP6[] = IP6(0xfd000000, 0x0, 0x0, 0x00000001);
+
+SEC("sk_lookup/lookup_pass")
+int lookup_pass(struct bpf_sk_lookup *ctx)
+{
+       return SK_PASS;
+}
+
+SEC("sk_lookup/lookup_drop")
+int lookup_drop(struct bpf_sk_lookup *ctx)
+{
+       return SK_DROP;
+}
+
+SEC("sk_reuseport/reuse_pass")
+int reuseport_pass(struct sk_reuseport_md *ctx)
+{
+       return SK_PASS;
+}
+
+SEC("sk_reuseport/reuse_drop")
+int reuseport_drop(struct sk_reuseport_md *ctx)
+{
+       return SK_DROP;
+}
+
+/* Redirect packets destined for port DST_PORT to socket at redir_map[0]. */
+SEC("sk_lookup/redir_port")
+int redir_port(struct bpf_sk_lookup *ctx)
+{
+       struct bpf_sock *sk;
+       int err;
+
+       if (ctx->local_port != DST_PORT)
+               return SK_PASS;
+
+       sk = bpf_map_lookup_elem(&redir_map, &KEY_SERVER_A);
+       if (!sk)
+               return SK_PASS;
+
+       err = bpf_sk_assign(ctx, sk, 0);
+       bpf_sk_release(sk);
+       return err ? SK_DROP : SK_PASS;
+}
+
+/* Redirect packets destined for DST_IP4 address to socket at redir_map[0]. */
+SEC("sk_lookup/redir_ip4")
+int redir_ip4(struct bpf_sk_lookup *ctx)
+{
+       struct bpf_sock *sk;
+       int err;
+
+       if (ctx->family != AF_INET)
+               return SK_PASS;
+       if (ctx->local_port != DST_PORT)
+               return SK_PASS;
+       if (ctx->local_ip4 != DST_IP4)
+               return SK_PASS;
+
+       sk = bpf_map_lookup_elem(&redir_map, &KEY_SERVER_A);
+       if (!sk)
+               return SK_PASS;
+
+       err = bpf_sk_assign(ctx, sk, 0);
+       bpf_sk_release(sk);
+       return err ? SK_DROP : SK_PASS;
+}
+
+/* Redirect packets destined for DST_IP6 address to socket at redir_map[0]. */
+SEC("sk_lookup/redir_ip6")
+int redir_ip6(struct bpf_sk_lookup *ctx)
+{
+       struct bpf_sock *sk;
+       int err;
+
+       if (ctx->family != AF_INET6)
+               return SK_PASS;
+       if (ctx->local_port != DST_PORT)
+               return SK_PASS;
+       if (ctx->local_ip6[0] != DST_IP6[0] ||
+           ctx->local_ip6[1] != DST_IP6[1] ||
+           ctx->local_ip6[2] != DST_IP6[2] ||
+           ctx->local_ip6[3] != DST_IP6[3])
+               return SK_PASS;
+
+       sk = bpf_map_lookup_elem(&redir_map, &KEY_SERVER_A);
+       if (!sk)
+               return SK_PASS;
+
+       err = bpf_sk_assign(ctx, sk, 0);
+       bpf_sk_release(sk);
+       return err ? SK_DROP : SK_PASS;
+}
+
+SEC("sk_lookup/select_sock_a")
+int select_sock_a(struct bpf_sk_lookup *ctx)
+{
+       struct bpf_sock *sk;
+       int err;
+
+       sk = bpf_map_lookup_elem(&redir_map, &KEY_SERVER_A);
+       if (!sk)
+               return SK_PASS;
+
+       err = bpf_sk_assign(ctx, sk, 0);
+       bpf_sk_release(sk);
+       return err ? SK_DROP : SK_PASS;
+}
+
+SEC("sk_lookup/select_sock_a_no_reuseport")
+int select_sock_a_no_reuseport(struct bpf_sk_lookup *ctx)
+{
+       struct bpf_sock *sk;
+       int err;
+
+       sk = bpf_map_lookup_elem(&redir_map, &KEY_SERVER_A);
+       if (!sk)
+               return SK_DROP;
+
+       err = bpf_sk_assign(ctx, sk, BPF_SK_LOOKUP_F_NO_REUSEPORT);
+       bpf_sk_release(sk);
+       return err ? SK_DROP : SK_PASS;
+}
+
+SEC("sk_reuseport/select_sock_b")
+int select_sock_b(struct sk_reuseport_md *ctx)
+{
+       __u32 key = KEY_SERVER_B;
+       int err;
+
+       err = bpf_sk_select_reuseport(ctx, &redir_map, &key, 0);
+       return err ? SK_DROP : SK_PASS;
+}
+
+/* Check that bpf_sk_assign() returns -EEXIST if socket already selected. */
+SEC("sk_lookup/sk_assign_eexist")
+int sk_assign_eexist(struct bpf_sk_lookup *ctx)
+{
+       struct bpf_sock *sk;
+       int err, ret;
+
+       ret = SK_DROP;
+       sk = bpf_map_lookup_elem(&redir_map, &KEY_SERVER_B);
+       if (!sk)
+               goto out;
+       err = bpf_sk_assign(ctx, sk, 0);
+       if (err)
+               goto out;
+       bpf_sk_release(sk);
+
+       sk = bpf_map_lookup_elem(&redir_map, &KEY_SERVER_A);
+       if (!sk)
+               goto out;
+       err = bpf_sk_assign(ctx, sk, 0);
+       if (err != -EEXIST) {
+               bpf_printk("sk_assign returned %d, expected %d\n",
+                          err, -EEXIST);
+               goto out;
+       }
+
+       ret = SK_PASS; /* Success, redirect to KEY_SERVER_B */
+out:
+       if (sk)
+               bpf_sk_release(sk);
+       return ret;
+}
+
+/* Check that bpf_sk_assign(BPF_SK_LOOKUP_F_REPLACE) can override selection. */
+SEC("sk_lookup/sk_assign_replace_flag")
+int sk_assign_replace_flag(struct bpf_sk_lookup *ctx)
+{
+       struct bpf_sock *sk;
+       int err, ret;
+
+       ret = SK_DROP;
+       sk = bpf_map_lookup_elem(&redir_map, &KEY_SERVER_A);
+       if (!sk)
+               goto out;
+       err = bpf_sk_assign(ctx, sk, 0);
+       if (err)
+               goto out;
+       bpf_sk_release(sk);
+
+       sk = bpf_map_lookup_elem(&redir_map, &KEY_SERVER_B);
+       if (!sk)
+               goto out;
+       err = bpf_sk_assign(ctx, sk, BPF_SK_LOOKUP_F_REPLACE);
+       if (err) {
+               bpf_printk("sk_assign returned %d, expected 0\n", err);
+               goto out;
+       }
+
+       ret = SK_PASS; /* Success, redirect to KEY_SERVER_B */
+out:
+       if (sk)
+               bpf_sk_release(sk);
+       return ret;
+}
+
+/* Check that bpf_sk_assign(sk=NULL) is accepted. */
+SEC("sk_lookup/sk_assign_null")
+int sk_assign_null(struct bpf_sk_lookup *ctx)
+{
+       struct bpf_sock *sk = NULL;
+       int err, ret;
+
+       ret = SK_DROP;
+
+       err = bpf_sk_assign(ctx, NULL, 0);
+       if (err) {
+               bpf_printk("sk_assign returned %d, expected 0\n", err);
+               goto out;
+       }
+
+       sk = bpf_map_lookup_elem(&redir_map, &KEY_SERVER_B);
+       if (!sk)
+               goto out;
+       err = bpf_sk_assign(ctx, sk, BPF_SK_LOOKUP_F_REPLACE);
+       if (err) {
+               bpf_printk("sk_assign returned %d, expected 0\n", err);
+               goto out;
+       }
+
+       if (ctx->sk != sk)
+               goto out;
+       err = bpf_sk_assign(ctx, NULL, 0);
+       if (err != -EEXIST)
+               goto out;
+       err = bpf_sk_assign(ctx, NULL, BPF_SK_LOOKUP_F_REPLACE);
+       if (err)
+               goto out;
+       err = bpf_sk_assign(ctx, sk, BPF_SK_LOOKUP_F_REPLACE);
+       if (err)
+               goto out;
+
+       ret = SK_PASS; /* Success, redirect to KEY_SERVER_B */
+out:
+       if (sk)
+               bpf_sk_release(sk);
+       return ret;
+}
+
+/* Check that selected sk is accessible through context. */
+SEC("sk_lookup/access_ctx_sk")
+int access_ctx_sk(struct bpf_sk_lookup *ctx)
+{
+       struct bpf_sock *sk1 = NULL, *sk2 = NULL;
+       int err, ret;
+
+       ret = SK_DROP;
+
+       /* Try accessing unassigned (NULL) ctx->sk field */
+       if (ctx->sk && ctx->sk->family != AF_INET)
+               goto out;
+
+       /* Assign a value to ctx->sk */
+       sk1 = bpf_map_lookup_elem(&redir_map, &KEY_SERVER_A);
+       if (!sk1)
+               goto out;
+       err = bpf_sk_assign(ctx, sk1, 0);
+       if (err)
+               goto out;
+       if (ctx->sk != sk1)
+               goto out;
+
+       /* Access ctx->sk fields */
+       if (ctx->sk->family != AF_INET ||
+           ctx->sk->type != SOCK_STREAM ||
+           ctx->sk->state != BPF_TCP_LISTEN)
+               goto out;
+
+       /* Reset selection */
+       err = bpf_sk_assign(ctx, NULL, BPF_SK_LOOKUP_F_REPLACE);
+       if (err)
+               goto out;
+       if (ctx->sk)
+               goto out;
+
+       /* Assign another socket */
+       sk2 = bpf_map_lookup_elem(&redir_map, &KEY_SERVER_B);
+       if (!sk2)
+               goto out;
+       err = bpf_sk_assign(ctx, sk2, BPF_SK_LOOKUP_F_REPLACE);
+       if (err)
+               goto out;
+       if (ctx->sk != sk2)
+               goto out;
+
+       /* Access reassigned ctx->sk fields */
+       if (ctx->sk->family != AF_INET ||
+           ctx->sk->type != SOCK_STREAM ||
+           ctx->sk->state != BPF_TCP_LISTEN)
+               goto out;
+
+       ret = SK_PASS; /* Success, redirect to KEY_SERVER_B */
+out:
+       if (sk1)
+               bpf_sk_release(sk1);
+       if (sk2)
+               bpf_sk_release(sk2);
+       return ret;
+}
+
+/* Check narrow loads from ctx fields that support them.
+ *
+ * Narrow loads of size >= target field size from a non-zero offset
+ * are not covered because they give bogus results: the
+ * verifier ignores the offset.
+ */
+SEC("sk_lookup/ctx_narrow_access")
+int ctx_narrow_access(struct bpf_sk_lookup *ctx)
+{
+       struct bpf_sock *sk;
+       int err, family;
+       __u16 *half;
+       __u8 *byte;
+       bool v4;
+
+       v4 = (ctx->family == AF_INET);
+
+       /* Narrow loads from family field */
+       byte = (__u8 *)&ctx->family;
+       half = (__u16 *)&ctx->family;
+       if (byte[0] != (v4 ? AF_INET : AF_INET6) ||
+           byte[1] != 0 || byte[2] != 0 || byte[3] != 0)
+               return SK_DROP;
+       if (half[0] != (v4 ? AF_INET : AF_INET6))
+               return SK_DROP;
+
+       byte = (__u8 *)&ctx->protocol;
+       if (byte[0] != IPPROTO_TCP ||
+           byte[1] != 0 || byte[2] != 0 || byte[3] != 0)
+               return SK_DROP;
+       half = (__u16 *)&ctx->protocol;
+       if (half[0] != IPPROTO_TCP)
+               return SK_DROP;
+
+       /* Narrow loads from remote_port field. Expect non-0 value. */
+       byte = (__u8 *)&ctx->remote_port;
+       if (byte[0] == 0 && byte[1] == 0 && byte[2] == 0 && byte[3] == 0)
+               return SK_DROP;
+       half = (__u16 *)&ctx->remote_port;
+       if (half[0] == 0)
+               return SK_DROP;
+
+       /* Narrow loads from local_port field. Expect DST_PORT. */
+       byte = (__u8 *)&ctx->local_port;
+       if (byte[0] != ((DST_PORT >> 0) & 0xff) ||
+           byte[1] != ((DST_PORT >> 8) & 0xff) ||
+           byte[2] != 0 || byte[3] != 0)
+               return SK_DROP;
+       half = (__u16 *)&ctx->local_port;
+       if (half[0] != DST_PORT)
+               return SK_DROP;
+
+       /* Narrow loads from IPv4 fields */
+       if (v4) {
+               /* Expect non-0.0.0.0 in remote_ip4 */
+               byte = (__u8 *)&ctx->remote_ip4;
+               if (byte[0] == 0 && byte[1] == 0 &&
+                   byte[2] == 0 && byte[3] == 0)
+                       return SK_DROP;
+               half = (__u16 *)&ctx->remote_ip4;
+               if (half[0] == 0 && half[1] == 0)
+                       return SK_DROP;
+
+               /* Expect DST_IP4 in local_ip4 */
+               byte = (__u8 *)&ctx->local_ip4;
+               if (byte[0] != ((DST_IP4 >>  0) & 0xff) ||
+                   byte[1] != ((DST_IP4 >>  8) & 0xff) ||
+                   byte[2] != ((DST_IP4 >> 16) & 0xff) ||
+                   byte[3] != ((DST_IP4 >> 24) & 0xff))
+                       return SK_DROP;
+               half = (__u16 *)&ctx->local_ip4;
+               if (half[0] != ((DST_IP4 >>  0) & 0xffff) ||
+                   half[1] != ((DST_IP4 >> 16) & 0xffff))
+                       return SK_DROP;
+       } else {
+               /* Expect 0.0.0.0 IPs when family != AF_INET */
+               byte = (__u8 *)&ctx->remote_ip4;
+               if (byte[0] != 0 || byte[1] != 0 &&
+                   byte[2] != 0 || byte[3] != 0)
+                       return SK_DROP;
+               half = (__u16 *)&ctx->remote_ip4;
+               if (half[0] != 0 || half[1] != 0)
+                       return SK_DROP;
+
+               byte = (__u8 *)&ctx->local_ip4;
+               if (byte[0] != 0 || byte[1] != 0 &&
+                   byte[2] != 0 || byte[3] != 0)
+                       return SK_DROP;
+               half = (__u16 *)&ctx->local_ip4;
+               if (half[0] != 0 || half[1] != 0)
+                       return SK_DROP;
+       }
+
+       /* Narrow loads from IPv6 fields */
+       if (!v4) {
+               /* Expect non-:: IP in remote_ip6 */
+               byte = (__u8 *)&ctx->remote_ip6;
+               if (byte[0] == 0 && byte[1] == 0 &&
+                   byte[2] == 0 && byte[3] == 0 &&
+                   byte[4] == 0 && byte[5] == 0 &&
+                   byte[6] == 0 && byte[7] == 0 &&
+                   byte[8] == 0 && byte[9] == 0 &&
+                   byte[10] == 0 && byte[11] == 0 &&
+                   byte[12] == 0 && byte[13] == 0 &&
+                   byte[14] == 0 && byte[15] == 0)
+                       return SK_DROP;
+               half = (__u16 *)&ctx->remote_ip6;
+               if (half[0] == 0 && half[1] == 0 &&
+                   half[2] == 0 && half[3] == 0 &&
+                   half[4] == 0 && half[5] == 0 &&
+                   half[6] == 0 && half[7] == 0)
+                       return SK_DROP;
+
+               /* Expect DST_IP6 in local_ip6 */
+               byte = (__u8 *)&ctx->local_ip6;
+               if (byte[0] != ((DST_IP6[0] >>  0) & 0xff) ||
+                   byte[1] != ((DST_IP6[0] >>  8) & 0xff) ||
+                   byte[2] != ((DST_IP6[0] >> 16) & 0xff) ||
+                   byte[3] != ((DST_IP6[0] >> 24) & 0xff) ||
+                   byte[4] != ((DST_IP6[1] >>  0) & 0xff) ||
+                   byte[5] != ((DST_IP6[1] >>  8) & 0xff) ||
+                   byte[6] != ((DST_IP6[1] >> 16) & 0xff) ||
+                   byte[7] != ((DST_IP6[1] >> 24) & 0xff) ||
+                   byte[8] != ((DST_IP6[2] >>  0) & 0xff) ||
+                   byte[9] != ((DST_IP6[2] >>  8) & 0xff) ||
+                   byte[10] != ((DST_IP6[2] >> 16) & 0xff) ||
+                   byte[11] != ((DST_IP6[2] >> 24) & 0xff) ||
+                   byte[12] != ((DST_IP6[3] >>  0) & 0xff) ||
+                   byte[13] != ((DST_IP6[3] >>  8) & 0xff) ||
+                   byte[14] != ((DST_IP6[3] >> 16) & 0xff) ||
+                   byte[15] != ((DST_IP6[3] >> 24) & 0xff))
+                       return SK_DROP;
+               half = (__u16 *)&ctx->local_ip6;
+               if (half[0] != ((DST_IP6[0] >>  0) & 0xffff) ||
+                   half[1] != ((DST_IP6[0] >> 16) & 0xffff) ||
+                   half[2] != ((DST_IP6[1] >>  0) & 0xffff) ||
+                   half[3] != ((DST_IP6[1] >> 16) & 0xffff) ||
+                   half[4] != ((DST_IP6[2] >>  0) & 0xffff) ||
+                   half[5] != ((DST_IP6[2] >> 16) & 0xffff) ||
+                   half[6] != ((DST_IP6[3] >>  0) & 0xffff) ||
+                   half[7] != ((DST_IP6[3] >> 16) & 0xffff))
+                       return SK_DROP;
+       } else {
+               /* Expect :: IPs when family != AF_INET6 */
+               byte = (__u8 *)&ctx->remote_ip6;
+               if (byte[0] != 0 || byte[1] != 0 ||
+                   byte[2] != 0 || byte[3] != 0 ||
+                   byte[4] != 0 || byte[5] != 0 ||
+                   byte[6] != 0 || byte[7] != 0 ||
+                   byte[8] != 0 || byte[9] != 0 ||
+                   byte[10] != 0 || byte[11] != 0 ||
+                   byte[12] != 0 || byte[13] != 0 ||
+                   byte[14] != 0 || byte[15] != 0)
+                       return SK_DROP;
+               half = (__u16 *)&ctx->remote_ip6;
+               if (half[0] != 0 || half[1] != 0 ||
+                   half[2] != 0 || half[3] != 0 ||
+                   half[4] != 0 || half[5] != 0 ||
+                   half[6] != 0 || half[7] != 0)
+                       return SK_DROP;
+
+               byte = (__u8 *)&ctx->local_ip6;
+               if (byte[0] != 0 || byte[1] != 0 ||
+                   byte[2] != 0 || byte[3] != 0 ||
+                   byte[4] != 0 || byte[5] != 0 ||
+                   byte[6] != 0 || byte[7] != 0 ||
+                   byte[8] != 0 || byte[9] != 0 ||
+                   byte[10] != 0 || byte[11] != 0 ||
+                   byte[12] != 0 || byte[13] != 0 ||
+                   byte[14] != 0 || byte[15] != 0)
+                       return SK_DROP;
+               half = (__u16 *)&ctx->local_ip6;
+               if (half[0] != 0 || half[1] != 0 ||
+                   half[2] != 0 || half[3] != 0 ||
+                   half[4] != 0 || half[5] != 0 ||
+                   half[6] != 0 || half[7] != 0)
+                       return SK_DROP;
+       }
+
+       /* Success, redirect to KEY_SERVER_B */
+       sk = bpf_map_lookup_elem(&redir_map, &KEY_SERVER_B);
+       if (sk) {
+               bpf_sk_assign(ctx, sk, 0);
+               bpf_sk_release(sk);
+       }
+       return SK_PASS;
+}
+
+/* Check that sk_assign rejects SERVER_A socket with -ESOCKTNOSUPPORT */
+SEC("sk_lookup/sk_assign_esocknosupport")
+int sk_assign_esocknosupport(struct bpf_sk_lookup *ctx)
+{
+       struct bpf_sock *sk;
+       int err, ret;
+
+       ret = SK_DROP;
+       sk = bpf_map_lookup_elem(&redir_map, &KEY_SERVER_A);
+       if (!sk)
+               goto out;
+
+       err = bpf_sk_assign(ctx, sk, 0);
+       if (err != -ESOCKTNOSUPPORT) {
+               bpf_printk("sk_assign returned %d, expected %d\n",
+                          err, -ESOCKTNOSUPPORT);
+               goto out;
+       }
+
+       ret = SK_PASS; /* Success, pass to regular lookup */
+out:
+       if (sk)
+               bpf_sk_release(sk);
+       return ret;
+}
+
+SEC("sk_lookup/multi_prog_pass1")
+int multi_prog_pass1(struct bpf_sk_lookup *ctx)
+{
+       bpf_map_update_elem(&run_map, &KEY_PROG1, &PROG_DONE, BPF_ANY);
+       return SK_PASS;
+}
+
+SEC("sk_lookup/multi_prog_pass2")
+int multi_prog_pass2(struct bpf_sk_lookup *ctx)
+{
+       bpf_map_update_elem(&run_map, &KEY_PROG2, &PROG_DONE, BPF_ANY);
+       return SK_PASS;
+}
+
+SEC("sk_lookup/multi_prog_drop1")
+int multi_prog_drop1(struct bpf_sk_lookup *ctx)
+{
+       bpf_map_update_elem(&run_map, &KEY_PROG1, &PROG_DONE, BPF_ANY);
+       return SK_DROP;
+}
+
+SEC("sk_lookup/multi_prog_drop2")
+int multi_prog_drop2(struct bpf_sk_lookup *ctx)
+{
+       bpf_map_update_elem(&run_map, &KEY_PROG2, &PROG_DONE, BPF_ANY);
+       return SK_DROP;
+}
+
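+/* Helper shared by the multi_prog_redir* programs below: select the
+ * socket at SERVER_A and report whether the assignment succeeded.
+ */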
+static __always_inline int select_server_a(struct bpf_sk_lookup *ctx)
+{
+       struct bpf_sock *sk;
+       int err;
+
+       sk = bpf_map_lookup_elem(&redir_map, &KEY_SERVER_A);
+       if (!sk)
+               return SK_DROP;
+
+       err = bpf_sk_assign(ctx, sk, 0);
+       bpf_sk_release(sk);
+       if (err)
+               return SK_DROP;
+
+       return SK_PASS;
+}
+
+SEC("sk_lookup/multi_prog_redir1")
+int multi_prog_redir1(struct bpf_sk_lookup *ctx)
+{
+       int ret;
+
+       ret = select_server_a(ctx);
+       bpf_map_update_elem(&run_map, &KEY_PROG1, &PROG_DONE, BPF_ANY);
+       return SK_PASS;
+}
+
+SEC("sk_lookup/multi_prog_redir2")
+int multi_prog_redir2(struct bpf_sk_lookup *ctx)
+{
+       int ret;
+
+       ret = select_server_a(ctx);
+       bpf_map_update_elem(&run_map, &KEY_PROG2, &PROG_DONE, BPF_ANY);
+       return SK_PASS;
+}
+
+char _license[] SEC("license") = "Dual BSD/GPL";
+__u32 _version SEC("version") = 1;
diff --git a/tools/testing/selftests/bpf/progs/test_xdp_with_cpumap_helpers.c b/tools/testing/selftests/bpf/progs/test_xdp_with_cpumap_helpers.c
new file mode 100644 (file)
index 0000000..59ee4f1
--- /dev/null
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+#define IFINDEX_LO     1
+
+struct {
+       __uint(type, BPF_MAP_TYPE_CPUMAP);
+       __uint(key_size, sizeof(__u32));
+       __uint(value_size, sizeof(struct bpf_cpumap_val));
+       __uint(max_entries, 4);
+} cpu_map SEC(".maps");
+
+SEC("xdp_redir")
+int xdp_redir_prog(struct xdp_md *ctx)
+{
+       return bpf_redirect_map(&cpu_map, 1, 0);
+}
+
+SEC("xdp_dummy")
+int xdp_dummy_prog(struct xdp_md *ctx)
+{
+       return XDP_PASS;
+}
+
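+/* Placed in the "xdp_cpumap/" section, so it is loaded with the
+ * BPF_XDP_CPUMAP expected attach type: valid as a cpu_map entry,
+ * but not attachable to a device.
+ */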
+SEC("xdp_cpumap/dummy_cm")
+int xdp_dummy_cm(struct xdp_md *ctx)
+{
+       if (ctx->ingress_ifindex == IFINDEX_LO)
+               return XDP_DROP;
+
+       return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
index 9df0d2a..4f6444b 100755 (executable)
@@ -10,7 +10,13 @@ if [ "$(id -u)" != "0" ]; then
        exit $ksft_skip
 fi
 
-SRC_TREE=../../../../
+if [ "$building_out_of_srctree" ]; then
+       # We are in linux-build/kselftest/bpf
+       OUTPUT=../../
+else
+       # We are in linux/tools/testing/selftests/bpf
+       OUTPUT=../../../../
+fi
 
 test_run()
 {
@@ -19,8 +25,8 @@ test_run()
 
        echo "[ JIT enabled:$1 hardened:$2 ]"
        dmesg -C
-       if [ -f ${SRC_TREE}/lib/test_bpf.ko ]; then
-               insmod ${SRC_TREE}/lib/test_bpf.ko 2> /dev/null
+       if [ -f ${OUTPUT}/lib/test_bpf.ko ]; then
+               insmod ${OUTPUT}/lib/test_bpf.ko 2> /dev/null
                if [ $? -ne 0 ]; then
                        rc=1
                fi
index 785eabf..5620919 100755 (executable)
@@ -140,7 +140,7 @@ ip netns exec ns6 sysctl net.ipv6.conf.veth10.seg6_enabled=1 > /dev/null
 ip netns exec ns6 nc -l -6 -u -d 7330 > $TMP_FILE &
 ip netns exec ns1 bash -c "echo 'foobar' | nc -w0 -6 -u -p 2121 -s fb00::1 fb00::6 7330"
 sleep 5 # wait enough time to ensure the UDP datagram arrived to the last segment
-kill -INT $!
+kill -TERM $!
 
 if [[ $(< $TMP_FILE) != "foobar" ]]; then
        exit 1
diff --git a/tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c b/tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c
new file mode 100644 (file)
index 0000000..2ad5f97
--- /dev/null
@@ -0,0 +1,492 @@
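+/* Exercise the 1, 2 and 4-byte narrow loads the verifier accepts on
+ * struct bpf_sk_lookup context fields.
+ */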
+{
+       "valid 1,2,4,8-byte reads from bpf_sk_lookup",
+       .insns = {
+               /* 1-byte read from family field */
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, family)),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, family) + 1),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, family) + 2),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, family) + 3),
+               /* 2-byte read from family field */
+               BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, family)),
+               BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, family) + 2),
+               /* 4-byte read from family field */
+               BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, family)),
+
+               /* 1-byte read from protocol field */
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, protocol)),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, protocol) + 1),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, protocol) + 2),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, protocol) + 3),
+               /* 2-byte read from protocol field */
+               BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, protocol)),
+               BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, protocol) + 2),
+               /* 4-byte read from protocol field */
+               BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, protocol)),
+
+               /* 1-byte read from remote_ip4 field */
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip4)),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip4) + 1),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip4) + 2),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip4) + 3),
+               /* 2-byte read from remote_ip4 field */
+               BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip4)),
+               BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip4) + 2),
+               /* 4-byte read from remote_ip4 field */
+               BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip4)),
+
+               /* 1-byte read from remote_ip6 field */
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip6)),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip6) + 1),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip6) + 2),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip6) + 3),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip6) + 4),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip6) + 5),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip6) + 6),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip6) + 7),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip6) + 8),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip6) + 9),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip6) + 10),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip6) + 11),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip6) + 12),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip6) + 13),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip6) + 14),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip6) + 15),
+               /* 2-byte read from remote_ip6 field */
+               BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip6)),
+               BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip6) + 2),
+               BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip6) + 4),
+               BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip6) + 6),
+               BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip6) + 8),
+               BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip6) + 10),
+               BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip6) + 12),
+               BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip6) + 14),
+               /* 4-byte read from remote_ip6 field */
+               BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip6)),
+               BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip6) + 4),
+               BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip6) + 8),
+               BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip6) + 12),
+
+               /* 1-byte read from remote_port field */
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_port)),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_port) + 1),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_port) + 2),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_port) + 3),
+               /* 2-byte read from remote_port field */
+               BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_port)),
+               BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_port) + 2),
+               /* 4-byte read from remote_port field */
+               BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_port)),
+
+               /* 1-byte read from local_ip4 field */
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip4)),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip4) + 1),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip4) + 2),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip4) + 3),
+               /* 2-byte read from local_ip4 field */
+               BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip4)),
+               BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip4) + 2),
+               /* 4-byte read from local_ip4 field */
+               BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip4)),
+
+               /* 1-byte read from local_ip6 field */
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip6)),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip6) + 1),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip6) + 2),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip6) + 3),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip6) + 4),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip6) + 5),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip6) + 6),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip6) + 7),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip6) + 8),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip6) + 9),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip6) + 10),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip6) + 11),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip6) + 12),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip6) + 13),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip6) + 14),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip6) + 15),
+               /* 2-byte read from local_ip6 field */
+               BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip6)),
+               BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip6) + 2),
+               BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip6) + 4),
+               BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip6) + 6),
+               BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip6) + 8),
+               BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip6) + 10),
+               BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip6) + 12),
+               BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip6) + 14),
+               /* 4-byte read from local_ip6 field */
+               BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip6)),
+               BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip6) + 4),
+               BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip6) + 8),
+               BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip6) + 12),
+
+               /* 1-byte read from local_port field */
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_port)),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_port) + 1),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_port) + 2),
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_port) + 3),
+               /* 2-byte read from local_port field */
+               BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_port)),
+               BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_port) + 2),
+               /* 4-byte read from local_port field */
+               BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_port)),
+
+               /* 8-byte read from sk field */
+               BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, sk)),
+
+               BPF_MOV32_IMM(BPF_REG_0, 0),
+               BPF_EXIT_INSN(),
+       },
+       .result = ACCEPT,
+       .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+       .expected_attach_type = BPF_SK_LOOKUP,
+},
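+/* The ACCEPT case above exercises every load this context permits:
+ * 1-byte reads at each offset of the 4-byte fields, aligned 2-byte
+ * reads, full 4-byte reads, and a full 8-byte read of the
+ * pointer-sized sk field. The REJECT cases below probe the converse
+ * of each of these rules.
+ */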
+/* invalid 8-byte reads from 4-byte fields in bpf_sk_lookup */
+{
+       "invalid 8-byte read from bpf_sk_lookup family field",
+       .insns = {
+               BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, family)),
+               BPF_MOV32_IMM(BPF_REG_0, 0),
+               BPF_EXIT_INSN(),
+       },
+       .errstr = "invalid bpf_context access",
+       .result = REJECT,
+       .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+       .expected_attach_type = BPF_SK_LOOKUP,
+},
+{
+       "invalid 8-byte read from bpf_sk_lookup protocol field",
+       .insns = {
+               BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, protocol)),
+               BPF_MOV32_IMM(BPF_REG_0, 0),
+               BPF_EXIT_INSN(),
+       },
+       .errstr = "invalid bpf_context access",
+       .result = REJECT,
+       .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+       .expected_attach_type = BPF_SK_LOOKUP,
+},
+{
+       "invalid 8-byte read from bpf_sk_lookup remote_ip4 field",
+       .insns = {
+               BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip4)),
+               BPF_MOV32_IMM(BPF_REG_0, 0),
+               BPF_EXIT_INSN(),
+       },
+       .errstr = "invalid bpf_context access",
+       .result = REJECT,
+       .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+       .expected_attach_type = BPF_SK_LOOKUP,
+},
+{
+       "invalid 8-byte read from bpf_sk_lookup remote_ip6 field",
+       .insns = {
+               BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_ip6)),
+               BPF_MOV32_IMM(BPF_REG_0, 0),
+               BPF_EXIT_INSN(),
+       },
+       .errstr = "invalid bpf_context access",
+       .result = REJECT,
+       .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+       .expected_attach_type = BPF_SK_LOOKUP,
+},
+{
+       "invalid 8-byte read from bpf_sk_lookup remote_port field",
+       .insns = {
+               BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, remote_port)),
+               BPF_MOV32_IMM(BPF_REG_0, 0),
+               BPF_EXIT_INSN(),
+       },
+       .errstr = "invalid bpf_context access",
+       .result = REJECT,
+       .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+       .expected_attach_type = BPF_SK_LOOKUP,
+},
+{
+       "invalid 8-byte read from bpf_sk_lookup local_ip4 field",
+       .insns = {
+               BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip4)),
+               BPF_MOV32_IMM(BPF_REG_0, 0),
+               BPF_EXIT_INSN(),
+       },
+       .errstr = "invalid bpf_context access",
+       .result = REJECT,
+       .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+       .expected_attach_type = BPF_SK_LOOKUP,
+},
+{
+       "invalid 8-byte read from bpf_sk_lookup local_ip6 field",
+       .insns = {
+               BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_ip6)),
+               BPF_MOV32_IMM(BPF_REG_0, 0),
+               BPF_EXIT_INSN(),
+       },
+       .errstr = "invalid bpf_context access",
+       .result = REJECT,
+       .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+       .expected_attach_type = BPF_SK_LOOKUP,
+},
+{
+       "invalid 8-byte read from bpf_sk_lookup local_port field",
+       .insns = {
+               BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, local_port)),
+               BPF_MOV32_IMM(BPF_REG_0, 0),
+               BPF_EXIT_INSN(),
+       },
+       .errstr = "invalid bpf_context access",
+       .result = REJECT,
+       .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+       .expected_attach_type = BPF_SK_LOOKUP,
+},
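+/* Each BPF_DW load above is rejected because it is wider than the
+ * 4-byte field (or 4-byte array element) it targets.
+ */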
+/* invalid 1-, 2-, and 4-byte reads from the 8-byte sk field in bpf_sk_lookup */
+{
+       "invalid 4-byte read from bpf_sk_lookup sk field",
+       .insns = {
+               BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, sk)),
+               BPF_MOV32_IMM(BPF_REG_0, 0),
+               BPF_EXIT_INSN(),
+       },
+       .errstr = "invalid bpf_context access",
+       .result = REJECT,
+       .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+       .expected_attach_type = BPF_SK_LOOKUP,
+},
+{
+       "invalid 2-byte read from bpf_sk_lookup sk field",
+       .insns = {
+               BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, sk)),
+               BPF_MOV32_IMM(BPF_REG_0, 0),
+               BPF_EXIT_INSN(),
+       },
+       .errstr = "invalid bpf_context access",
+       .result = REJECT,
+       .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+       .expected_attach_type = BPF_SK_LOOKUP,
+},
+{
+       "invalid 1-byte read from bpf_sk_lookup sk field",
+       .insns = {
+               BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+                           offsetof(struct bpf_sk_lookup, sk)),
+               BPF_MOV32_IMM(BPF_REG_0, 0),
+               BPF_EXIT_INSN(),
+       },
+       .errstr = "invalid bpf_context access",
+       .result = REJECT,
+       .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+       .expected_attach_type = BPF_SK_LOOKUP,
+},
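+/* Conversely, the pointer-sized sk field may only be read with a
+ * full 8-byte (BPF_DW) load; the narrower reads above are rejected.
+ */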
+/* out-of-bounds and unaligned reads from bpf_sk_lookup */
+{
+       "invalid 4-byte read past end of bpf_sk_lookup",
+       .insns = {
+               BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+                           sizeof(struct bpf_sk_lookup)),
+               BPF_MOV32_IMM(BPF_REG_0, 0),
+               BPF_EXIT_INSN(),
+       },
+       .errstr = "invalid bpf_context access",
+       .result = REJECT,
+       .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+       .expected_attach_type = BPF_SK_LOOKUP,
+},
+{
+       "invalid 4-byte unaligned read from bpf_sk_lookup at odd offset",
+       .insns = {
+               BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 1),
+               BPF_MOV32_IMM(BPF_REG_0, 0),
+               BPF_EXIT_INSN(),
+       },
+       .errstr = "invalid bpf_context access",
+       .result = REJECT,
+       .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+       .expected_attach_type = BPF_SK_LOOKUP,
+},
+{
+       "invalid 4-byte unaligned read from bpf_sk_lookup at even offset",
+       .insns = {
+               BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 2),
+               BPF_MOV32_IMM(BPF_REG_0, 0),
+               BPF_EXIT_INSN(),
+       },
+       .errstr = "invalid bpf_context access",
+       .result = REJECT,
+       .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+       .expected_attach_type = BPF_SK_LOOKUP,
+},
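+/* The two unaligned reads above fall inside the struct but do not
+ * start on a field boundary, so they are rejected just like the
+ * read past the end of the struct.
+ */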
+/* in-bounds and out-of-bounds writes to bpf_sk_lookup */
+{
+       "invalid 8-byte write to bpf_sk_lookup",
+       .insns = {
+               BPF_MOV64_IMM(BPF_REG_0, 0xcafe4a11U),
+               BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0),
+               BPF_MOV32_IMM(BPF_REG_0, 0),
+               BPF_EXIT_INSN(),
+       },
+       .errstr = "invalid bpf_context access",
+       .result = REJECT,
+       .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+       .expected_attach_type = BPF_SK_LOOKUP,
+},
+{
+       "invalid 4-byte write to bpf_sk_lookup",
+       .insns = {
+               BPF_MOV64_IMM(BPF_REG_0, 0xcafe4a11U),
+               BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0),
+               BPF_MOV32_IMM(BPF_REG_0, 0),
+               BPF_EXIT_INSN(),
+       },
+       .errstr = "invalid bpf_context access",
+       .result = REJECT,
+       .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+       .expected_attach_type = BPF_SK_LOOKUP,
+},
+{
+       "invalid 2-byte write to bpf_sk_lookup",
+       .insns = {
+               BPF_MOV64_IMM(BPF_REG_0, 0xcafe4a11U),
+               BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, 0),
+               BPF_MOV32_IMM(BPF_REG_0, 0),
+               BPF_EXIT_INSN(),
+       },
+       .errstr = "invalid bpf_context access",
+       .result = REJECT,
+       .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+       .expected_attach_type = BPF_SK_LOOKUP,
+},
+{
+       "invalid 1-byte write to bpf_sk_lookup",
+       .insns = {
+               BPF_MOV64_IMM(BPF_REG_0, 0xcafe4a11U),
+               BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+               BPF_MOV32_IMM(BPF_REG_0, 0),
+               BPF_EXIT_INSN(),
+       },
+       .errstr = "invalid bpf_context access",
+       .result = REJECT,
+       .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+       .expected_attach_type = BPF_SK_LOOKUP,
+},
+{
+       "invalid 4-byte write past end of bpf_sk_lookup",
+       .insns = {
+               BPF_MOV64_IMM(BPF_REG_0, 0xcafe4a11U),
+               BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+                           sizeof(struct bpf_sk_lookup)),
+               BPF_MOV32_IMM(BPF_REG_0, 0),
+               BPF_EXIT_INSN(),
+       },
+       .errstr = "invalid bpf_context access",
+       .result = REJECT,
+       .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+       .expected_attach_type = BPF_SK_LOOKUP,
+},
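+/* The write tests above confirm that bpf_sk_lookup is a read-only
+ * context: stores are rejected regardless of size, offset, or
+ * bounds.
+ */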