Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
author     Jakub Kicinski <kuba@kernel.org>
           Mon, 12 Oct 2020 23:16:50 +0000 (16:16 -0700)
committer  Jakub Kicinski <kuba@kernel.org>
           Mon, 12 Oct 2020 23:16:50 +0000 (16:16 -0700)
Alexei Starovoitov says:

====================
pull-request: bpf-next 2020-10-12

The main changes are:

1) BPF verifier improvements to track register allocation patterns, from Alexei and Yonghong.

2) libbpf relocation support for different size load/store, from Andrii.

3) bpf_redirect_peer() helper and support for inner map array with different max_entries, from Daniel (a usage sketch for the helper follows the sign-off below).

4) BPF support for per-cpu variables, from Hao.

5) sockmap improvements, from John.
====================

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
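
To make the new redirection helper concrete, the following is a minimal, hypothetical tc ingress program in the spirit of the new tools/testing/selftests/bpf/progs/test_tc_peer.c selftest. It is only a sketch: PEER_IFINDEX and redirect_to_peer are illustrative placeholders, and the ifindex must belong to a local veth whose peer sits in another network namespace.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Placeholder: ifindex of the local veth device whose peer (in another
 * network namespace) should receive the packet. */
#define PEER_IFINDEX 4

SEC("classifier")
int redirect_to_peer(struct __sk_buff *skb)
{
	/* bpf_redirect_peer() hands the skb to the peer device's ingress
	 * path in the other netns without going through the CPU backlog
	 * queue. It returns TC_ACT_REDIRECT on success or TC_ACT_SHOT on
	 * error, which is used directly as the tc verdict here. The flags
	 * argument is reserved and must be 0. */
	return bpf_redirect_peer(PEER_IFINDEX, 0);
}

char LICENSE[] SEC("license") = "GPL";

Attached at the tc ingress hook of the host-side veth (e.g. with "tc filter add dev veth0 ingress bpf da obj prog.o sec classifier"), packets are delivered into the peer namespace as if they had arrived on the peer's own ingress.
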
73 files changed:
Documentation/bpf/bpf_devel_QA.rst
MAINTAINERS
drivers/net/veth.c
include/linux/bpf.h
include/linux/bpf_verifier.h
include/linux/btf.h
include/linux/netdevice.h
include/linux/skmsg.h
include/net/tcp.h
include/uapi/linux/bpf.h
kernel/bpf/arraymap.c
kernel/bpf/btf.c
kernel/bpf/hashtab.c
kernel/bpf/helpers.c
kernel/bpf/percpu_freelist.c
kernel/bpf/percpu_freelist.h
kernel/bpf/syscall.c
kernel/bpf/verifier.c
kernel/trace/bpf_trace.c
net/core/dev.c
net/core/filter.c
net/core/skmsg.c
net/core/sock_map.c
net/ipv4/tcp_minisocks.c
net/xdp/xsk_buff_pool.c
net/xdp/xsk_queue.h
net/xdp/xskmap.c
samples/bpf/Makefile
samples/bpf/hbm.c
samples/bpf/xdp_monitor_kern.c
samples/bpf/xdp_monitor_user.c
samples/bpf/xdp_redirect_cpu_user.c
samples/bpf/xdp_sample_pkts_kern.c
samples/bpf/xdp_sample_pkts_user.c
samples/bpf/xdpsock_user.c
tools/include/uapi/linux/bpf.h
tools/lib/bpf/libbpf.c
tools/lib/bpf/xsk.c
tools/testing/selftests/bpf/README.rst
tools/testing/selftests/bpf/prog_tests/align.c
tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c
tools/testing/selftests/bpf/prog_tests/core_autosize.c [new file with mode: 0644]
tools/testing/selftests/bpf/prog_tests/ksyms.c
tools/testing/selftests/bpf/prog_tests/ksyms_btf.c [new file with mode: 0644]
tools/testing/selftests/bpf/prog_tests/pinning.c
tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c
tools/testing/selftests/bpf/prog_tests/test_profiler.c [new file with mode: 0644]
tools/testing/selftests/bpf/prog_tests/xdp_noinline.c
tools/testing/selftests/bpf/progs/connect4_prog.c
tools/testing/selftests/bpf/progs/profiler.h [new file with mode: 0644]
tools/testing/selftests/bpf/progs/profiler.inc.h [new file with mode: 0644]
tools/testing/selftests/bpf/progs/profiler1.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/profiler2.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/profiler3.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/test_btf_map_in_map.c
tools/testing/selftests/bpf/progs/test_core_autosize.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/test_ksyms_btf.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c
tools/testing/selftests/bpf/progs/test_sockmap_kern.h
tools/testing/selftests/bpf/progs/test_tc_neigh.c
tools/testing/selftests/bpf/progs/test_tc_peer.c [new file with mode: 0644]
tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c
tools/testing/selftests/bpf/test_sockmap.c
tools/testing/selftests/bpf/test_tc_neigh.sh [deleted file]
tools/testing/selftests/bpf/test_tc_redirect.sh [new file with mode: 0755]
tools/testing/selftests/bpf/test_tcp_hdr_options.h
tools/testing/selftests/bpf/trace_helpers.c
tools/testing/selftests/bpf/trace_helpers.h
tools/testing/selftests/bpf/verifier/basic.c
tools/testing/selftests/bpf/verifier/direct_packet_access.c
tools/testing/selftests/bpf/verifier/ld_imm64.c
tools/testing/selftests/bpf/verifier/regalloc.c [new file with mode: 0644]

diff --git a/Documentation/bpf/bpf_devel_QA.rst b/Documentation/bpf/bpf_devel_QA.rst
index 75a0dca..5b613d2 100644
@@ -60,13 +60,13 @@ Q: Where can I find patches currently under discussion for BPF subsystem?
 A: All patches that are Cc'ed to netdev are queued for review under netdev
 patchwork project:
 
-  http://patchwork.ozlabs.org/project/netdev/list/
+  https://patchwork.kernel.org/project/netdevbpf/list/
 
 Those patches which target BPF, are assigned to a 'bpf' delegate for
 further processing from BPF maintainers. The current queue with
 patches under review can be found at:
 
-  https://patchwork.ozlabs.org/project/netdev/list/?delegate=77147
+  https://patchwork.kernel.org/project/netdevbpf/list/?delegate=121173
 
 Once the patches have been reviewed by the BPF community as a whole
 and approved by the BPF maintainers, their status in patchwork will be
diff --git a/MAINTAINERS b/MAINTAINERS
index c80f87d..78a4655 100644
@@ -3263,7 +3263,7 @@ M:        Daniel Borkmann <daniel@iogearbox.net>
 R:     Martin KaFai Lau <kafai@fb.com>
 R:     Song Liu <songliubraving@fb.com>
 R:     Yonghong Song <yhs@fb.com>
-R:     Andrii Nakryiko <andriin@fb.com>
+R:     Andrii Nakryiko <andrii@kernel.org>
 R:     John Fastabend <john.fastabend@gmail.com>
 R:     KP Singh <kpsingh@chromium.org>
 L:     netdev@vger.kernel.org
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 091e5b4..8c73766 100644
@@ -420,6 +420,14 @@ static int veth_select_rxq(struct net_device *dev)
        return smp_processor_id() % dev->real_num_rx_queues;
 }
 
+static struct net_device *veth_peer_dev(struct net_device *dev)
+{
+       struct veth_priv *priv = netdev_priv(dev);
+
+       /* Callers must be under RCU read side. */
+       return rcu_dereference(priv->peer);
+}
+
 static int veth_xdp_xmit(struct net_device *dev, int n,
                         struct xdp_frame **frames,
                         u32 flags, bool ndo_xmit)
@@ -1224,6 +1232,7 @@ static const struct net_device_ops veth_netdev_ops = {
        .ndo_set_rx_headroom    = veth_set_rx_headroom,
        .ndo_bpf                = veth_xdp,
        .ndo_xdp_xmit           = veth_ndo_xdp_xmit,
+       .ndo_get_peer_dev       = veth_peer_dev,
 };
 
 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 50e5c4b..2b16bf4 100644
@@ -82,7 +82,7 @@ struct bpf_map_ops {
        void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
                                int fd);
        void (*map_fd_put_ptr)(void *ptr);
-       u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf);
+       int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf);
        u32 (*map_fd_sys_lookup_elem)(void *ptr);
        void (*map_seq_show_elem)(struct bpf_map *map, void *key,
                                  struct seq_file *m);
@@ -293,6 +293,7 @@ enum bpf_arg_type {
        ARG_PTR_TO_ALLOC_MEM_OR_NULL,   /* pointer to dynamically allocated memory or NULL */
        ARG_CONST_ALLOC_SIZE_OR_ZERO,   /* number of allocated bytes requested */
        ARG_PTR_TO_BTF_ID_SOCK_COMMON,  /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */
+       ARG_PTR_TO_PERCPU_BTF_ID,       /* pointer to in-kernel percpu type */
        __BPF_ARG_TYPE_MAX,
 };
 
@@ -307,6 +308,8 @@ enum bpf_return_type {
        RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */
        RET_PTR_TO_ALLOC_MEM_OR_NULL,   /* returns a pointer to dynamically allocated memory or NULL */
        RET_PTR_TO_BTF_ID_OR_NULL,      /* returns a pointer to a btf_id or NULL */
+       RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */
+       RET_PTR_TO_MEM_OR_BTF_ID,       /* returns a pointer to a valid memory or a btf_id */
 };
 
 /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs
@@ -405,6 +408,7 @@ enum bpf_reg_type {
        PTR_TO_RDONLY_BUF_OR_NULL, /* reg points to a readonly buffer or NULL */
        PTR_TO_RDWR_BUF,         /* reg points to a read/write buffer */
        PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */
+       PTR_TO_PERCPU_BTF_ID,    /* reg points to a percpu kernel variable */
 };
 
 /* The information passed from prog-specific *_is_valid_access
@@ -1828,6 +1832,8 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto;
 extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto;
 extern const struct bpf_func_proto bpf_copy_from_user_proto;
 extern const struct bpf_func_proto bpf_snprintf_btf_proto;
+extern const struct bpf_func_proto bpf_per_cpu_ptr_proto;
+extern const struct bpf_func_proto bpf_this_cpu_ptr_proto;
 
 const struct bpf_func_proto *bpf_tracing_func_proto(
        enum bpf_func_id func_id, const struct bpf_prog *prog);
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 363b4f1..e83ef6f 100644
@@ -308,6 +308,13 @@ struct bpf_insn_aux_data {
                        u32 map_index;          /* index into used_maps[] */
                        u32 map_off;            /* offset from value base address */
                };
+               struct {
+                       enum bpf_reg_type reg_type;     /* type of pseudo_btf_id */
+                       union {
+                               u32 btf_id;     /* btf_id for struct typed var */
+                               u32 mem_size;   /* mem_size for non-struct typed var */
+                       };
+               } btf_var;
        };
        u64 map_key_state; /* constant (32 bit) key tracking for maps */
        int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 024e16f..2bf6418 100644
@@ -110,6 +110,11 @@ btf_resolve_size(const struct btf *btf, const struct btf_type *type,
             i < btf_type_vlen(struct_type);                    \
             i++, member++)
 
+#define for_each_vsi(i, datasec_type, member)                  \
+       for (i = 0, member = btf_type_var_secinfo(datasec_type);        \
+            i < btf_type_vlen(datasec_type);                   \
+            i++, member++)
+
 static inline bool btf_type_is_ptr(const struct btf_type *t)
 {
        return BTF_INFO_KIND(t->info) == BTF_KIND_PTR;
@@ -145,6 +150,21 @@ static inline bool btf_type_is_func_proto(const struct btf_type *t)
        return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO;
 }
 
+static inline bool btf_type_is_var(const struct btf_type *t)
+{
+       return BTF_INFO_KIND(t->info) == BTF_KIND_VAR;
+}
+
+/* union is only a special case of struct:
+ * all its offsetof(member) == 0
+ */
+static inline bool btf_type_is_struct(const struct btf_type *t)
+{
+       u8 kind = BTF_INFO_KIND(t->info);
+
+       return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION;
+}
+
 static inline u16 btf_type_vlen(const struct btf_type *t)
 {
        return BTF_INFO_VLEN(t->info);
@@ -179,6 +199,12 @@ static inline const struct btf_member *btf_type_member(const struct btf_type *t)
        return (const struct btf_member *)(t + 1);
 }
 
+static inline const struct btf_var_secinfo *btf_type_var_secinfo(
+               const struct btf_type *t)
+{
+       return (const struct btf_var_secinfo *)(t + 1);
+}
+
 #ifdef CONFIG_BPF_SYSCALL
 const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);
 const char *btf_name_by_offset(const struct btf *btf, u32 offset);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a0df43b..948d710 100644
@@ -1276,6 +1276,9 @@ struct netdev_net_notifier {
  * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p,
  *                      int cmd);
  *     Add, change, delete or get information on an IPv4 tunnel.
+ * struct net_device *(*ndo_get_peer_dev)(struct net_device *dev);
+ *     If a device is paired with a peer device, return the peer instance.
+ *     The caller must be under RCU read context.
  */
 struct net_device_ops {
        int                     (*ndo_init)(struct net_device *dev);
@@ -1483,6 +1486,7 @@ struct net_device_ops {
        struct devlink_port *   (*ndo_get_devlink_port)(struct net_device *dev);
        int                     (*ndo_tunnel_ctl)(struct net_device *dev,
                                                  struct ip_tunnel_parm *p, int cmd);
+       struct net_device *     (*ndo_get_peer_dev)(struct net_device *dev);
 };
 
 /**
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 3119928..fec0c5a 100644
@@ -308,6 +308,8 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node);
 int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock);
 void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock);
 void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock);
+void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock);
+void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock);
 
 int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
                         struct sk_msg *msg);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3601dea..d4ef5bf 100644
@@ -2228,34 +2228,6 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
 #endif /* CONFIG_NET_SOCK_MSG */
 
 #ifdef CONFIG_CGROUP_BPF
-/* Copy the listen sk's HDR_OPT_CB flags to its child.
- *
- * During 3-Way-HandShake, the synack is usually sent from
- * the listen sk with the HDR_OPT_CB flags set so that
- * bpf-prog will be called to write the BPF hdr option.
- *
- * In fastopen, the child sk is used to send synack instead
- * of the listen sk.  Thus, inheriting the HDR_OPT_CB flags
- * from the listen sk gives the bpf-prog a chance to write
- * BPF hdr option in the synack pkt during fastopen.
- *
- * Both fastopen and non-fastopen child will inherit the
- * HDR_OPT_CB flags to keep the bpf-prog having a consistent
- * behavior when deciding to clear this cb flags (or not)
- * during the PASSIVE_ESTABLISHED_CB.
- *
- * In the future, other cb flags could be inherited here also.
- */
-static inline void bpf_skops_init_child(const struct sock *sk,
-                                       struct sock *child)
-{
-       tcp_sk(child)->bpf_sock_ops_cb_flags =
-               tcp_sk(sk)->bpf_sock_ops_cb_flags &
-               (BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG |
-                BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG |
-                BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG);
-}
-
 static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops,
                                      struct sk_buff *skb,
                                      unsigned int end_offset)
@@ -2264,11 +2236,6 @@ static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops,
        skops->skb_data_end = skb->data + end_offset;
 }
 #else
-static inline void bpf_skops_init_child(const struct sock *sk,
-                                       struct sock *child)
-{
-}
-
 static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops,
                                      struct sk_buff *skb,
                                      unsigned int end_offset)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4f556cf..bf5a99d 100644
@@ -356,18 +356,36 @@ enum bpf_link_type {
 #define BPF_F_SLEEPABLE                (1U << 4)
 
 /* When BPF ldimm64's insn[0].src_reg != 0 then this can have
- * two extensions:
- *
- * insn[0].src_reg:  BPF_PSEUDO_MAP_FD   BPF_PSEUDO_MAP_VALUE
- * insn[0].imm:      map fd              map fd
- * insn[1].imm:      0                   offset into value
- * insn[0].off:      0                   0
- * insn[1].off:      0                   0
- * ldimm64 rewrite:  address of map      address of map[0]+offset
- * verifier type:    CONST_PTR_TO_MAP    PTR_TO_MAP_VALUE
+ * the following extensions:
+ *
+ * insn[0].src_reg:  BPF_PSEUDO_MAP_FD
+ * insn[0].imm:      map fd
+ * insn[1].imm:      0
+ * insn[0].off:      0
+ * insn[1].off:      0
+ * ldimm64 rewrite:  address of map
+ * verifier type:    CONST_PTR_TO_MAP
  */
 #define BPF_PSEUDO_MAP_FD      1
+/* insn[0].src_reg:  BPF_PSEUDO_MAP_VALUE
+ * insn[0].imm:      map fd
+ * insn[1].imm:      offset into value
+ * insn[0].off:      0
+ * insn[1].off:      0
+ * ldimm64 rewrite:  address of map[0]+offset
+ * verifier type:    PTR_TO_MAP_VALUE
+ */
 #define BPF_PSEUDO_MAP_VALUE   2
+/* insn[0].src_reg:  BPF_PSEUDO_BTF_ID
+ * insn[0].imm:      kernel btd id of VAR
+ * insn[1].imm:      0
+ * insn[0].off:      0
+ * insn[1].off:      0
+ * ldimm64 rewrite:  address of the kernel variable
+ * verifier type:    PTR_TO_BTF_ID or PTR_TO_MEM, depending on whether the var
+ *                   is struct/union.
+ */
+#define BPF_PSEUDO_BTF_ID      3
 
 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative
  * offset to another bpf function
@@ -417,6 +435,9 @@ enum {
 
 /* Share perf_event among processes */
        BPF_F_PRESERVE_ELEMS    = (1U << 11),
+
+/* Create a map that is suitable to be an inner map with dynamic max entries */
+       BPF_F_INNER_MAP         = (1U << 12),
 };
 
 /* Flags for BPF_PROG_QUERY. */
@@ -1680,7 +1701,7 @@ union bpf_attr {
  *               **TCP_CONGESTION**, **TCP_BPF_IW**,
  *               **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**,
  *               **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**,
- *               **TCP_SYNCNT**, **TCP_USER_TIMEOUT**.
+ *               **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**.
  *             * **IPPROTO_IP**, which supports *optname* **IP_TOS**.
  *             * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
  *     Return
@@ -2235,7 +2256,7 @@ union bpf_attr {
  *     Description
  *             This helper is used in programs implementing policies at the
  *             skb socket level. If the sk_buff *skb* is allowed to pass (i.e.
- *             if the verdeict eBPF program returns **SK_PASS**), redirect it
+ *             if the verdict eBPF program returns **SK_PASS**), redirect it
  *             to the socket referenced by *map* (of type
  *             **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
  *             egress interfaces can be used for redirection. The
@@ -3661,10 +3682,59 @@ union bpf_attr {
  *             Redirect the packet to another net device of index *ifindex*
  *             and fill in L2 addresses from neighboring subsystem. This helper
  *             is somewhat similar to **bpf_redirect**\ (), except that it
- *             fills in e.g. MAC addresses based on the L3 information from
- *             the packet. This helper is supported for IPv4 and IPv6 protocols.
+ *             populates L2 addresses as well, meaning, internally, the helper
+ *             performs a FIB lookup based on the skb's networking header to
+ *             get the address of the next hop and then relies on the neighbor
+ *             lookup for the L2 address of the nexthop.
+ *
+ *             The *flags* argument is reserved and must be 0. The helper is
+ *             currently only supported for tc BPF program types, and enabled
+ *             for IPv4 and IPv6 protocols.
+ *     Return
+ *             The helper returns **TC_ACT_REDIRECT** on success or
+ *             **TC_ACT_SHOT** on error.
+ *
+ * void *bpf_per_cpu_ptr(const void *percpu_ptr, u32 cpu)
+ *     Description
+ *             Take a pointer to a percpu ksym, *percpu_ptr*, and return a
+ *             pointer to the percpu kernel variable on *cpu*. A ksym is an
+ *             extern variable decorated with '__ksym'. For ksym, there is a
+ *             global var (either static or global) defined of the same name
+ *             in the kernel. The ksym is percpu if the global var is percpu.
+ *             The returned pointer points to the global percpu var on *cpu*.
+ *
+ *             bpf_per_cpu_ptr() has the same semantic as per_cpu_ptr() in the
+ *             kernel, except that bpf_per_cpu_ptr() may return NULL. This
+ *             happens if *cpu* is larger than nr_cpu_ids. The caller of
+ *             bpf_per_cpu_ptr() must check the returned value.
+ *     Return
+ *             A pointer pointing to the kernel percpu variable on *cpu*, or
+ *             NULL, if *cpu* is invalid.
+ *
+ * void *bpf_this_cpu_ptr(const void *percpu_ptr)
+ *     Description
+ *             Take a pointer to a percpu ksym, *percpu_ptr*, and return a
+ *             pointer to the percpu kernel variable on this cpu. See the
+ *             description of 'ksym' in **bpf_per_cpu_ptr**\ ().
+ *
+ *             bpf_this_cpu_ptr() has the same semantic as this_cpu_ptr() in
+ *             the kernel. Different from **bpf_per_cpu_ptr**\ (), it would
+ *             never return NULL.
+ *     Return
+ *             A pointer pointing to the kernel percpu variable on this cpu.
+ *
+ * long bpf_redirect_peer(u32 ifindex, u64 flags)
+ *     Description
+ *             Redirect the packet to another net device of index *ifindex*.
+ *             This helper is somewhat similar to **bpf_redirect**\ (), except
+ *             that the redirection happens to the *ifindex*' peer device and
+ *             the netns switch takes place from ingress to ingress without
+ *             going through the CPU's backlog queue.
+ *
  *             The *flags* argument is reserved and must be 0. The helper is
- *             currently only supported for tc BPF program types.
+ *             currently only supported for tc BPF program types at the ingress
+ *             hook and for veth device types. The peer device must reside in a
+ *             different network namespace.
  *     Return
  *             The helper returns **TC_ACT_REDIRECT** on success or
  *             **TC_ACT_SHOT** on error.
@@ -3823,6 +3893,9 @@ union bpf_attr {
        FN(seq_printf_btf),             \
        FN(skb_cgroup_classid),         \
        FN(redirect_neigh),             \
+       FN(bpf_per_cpu_ptr),            \
+       FN(bpf_this_cpu_ptr),           \
+       FN(redirect_peer),              \
        /* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
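
As a rough illustration of the two per-cpu helpers documented above (not part of the patch itself), a tracing program can dereference a per-cpu kernel variable through a __ksym extern. The sketch below is modeled loosely on the new test_ksyms_btf selftest; it assumes a kernel built with CONFIG_DEBUG_INFO_BTF, a vmlinux.h generated by bpftool, a libbpf with __ksym support, and uses the kernel's percpu int bpf_prog_active purely as an example. The program and variable names are illustrative.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

/* __ksym extern: libbpf resolves this into a BPF_PSEUDO_BTF_ID ldimm64
 * referencing the kernel's own percpu variable 'bpf_prog_active'. */
extern const int bpf_prog_active __ksym;

int cpu0_prog_active = -1;      /* readable from user space via the skeleton */
int this_cpu_prog_active = -1;

SEC("raw_tp/sys_enter")
int dump_prog_active(const void *ctx)
{
	const int *active;

	/* bpf_per_cpu_ptr() may return NULL (e.g. if cpu >= nr_cpu_ids),
	 * so the result must be checked before dereferencing. */
	active = bpf_per_cpu_ptr(&bpf_prog_active, 0);
	if (active)
		cpu0_prog_active = *active;

	/* bpf_this_cpu_ptr() never returns NULL. */
	this_cpu_prog_active = *(const int *)bpf_this_cpu_ptr(&bpf_prog_active);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";
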
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index bd777dd..c6c81ec 100644
@@ -16,7 +16,7 @@
 
 #define ARRAY_CREATE_FLAG_MASK \
        (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK | \
-        BPF_F_PRESERVE_ELEMS)
+        BPF_F_PRESERVE_ELEMS | BPF_F_INNER_MAP)
 
 static void bpf_array_free_percpu(struct bpf_array *array)
 {
@@ -62,7 +62,7 @@ int array_map_alloc_check(union bpf_attr *attr)
                return -EINVAL;
 
        if (attr->map_type != BPF_MAP_TYPE_ARRAY &&
-           attr->map_flags & BPF_F_MMAPABLE)
+           attr->map_flags & (BPF_F_MMAPABLE | BPF_F_INNER_MAP))
                return -EINVAL;
 
        if (attr->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY &&
@@ -214,7 +214,7 @@ static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm,
 }
 
 /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
-static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
 {
        struct bpf_array *array = container_of(map, struct bpf_array, map);
        struct bpf_insn *insn = insn_buf;
@@ -223,6 +223,9 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
        const int map_ptr = BPF_REG_1;
        const int index = BPF_REG_2;
 
+       if (map->map_flags & BPF_F_INNER_MAP)
+               return -EOPNOTSUPP;
+
        *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
        *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
        if (!map->bypass_spec_v1) {
@@ -496,8 +499,10 @@ static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
 static bool array_map_meta_equal(const struct bpf_map *meta0,
                                 const struct bpf_map *meta1)
 {
-       return meta0->max_entries == meta1->max_entries &&
-               bpf_map_meta_equal(meta0, meta1);
+       if (!bpf_map_meta_equal(meta0, meta1))
+               return false;
+       return meta0->map_flags & BPF_F_INNER_MAP ? true :
+              meta0->max_entries == meta1->max_entries;
 }
 
 struct bpf_iter_seq_array_map_info {
@@ -1251,7 +1256,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
        return READ_ONCE(*inner_map);
 }
 
-static u32 array_of_map_gen_lookup(struct bpf_map *map,
+static int array_of_map_gen_lookup(struct bpf_map *map,
                                   struct bpf_insn *insn_buf)
 {
        struct bpf_array *array = container_of(map, struct bpf_array, map);
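
For the BPF_F_INNER_MAP flag handled above, a BTF-defined map-in-map whose inner arrays have different sizes could look roughly like the following. This is a sketch loosely modeled on the updated test_btf_map_in_map.c selftest, using libbpf's __uint/__type/__array map-definition conventions; the names inner_small, inner_big and outer are illustrative only.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Inner arrays with different max_entries. BPF_F_INNER_MAP relaxes the
 * outer map's "same max_entries" compatibility check; in exchange the
 * verifier no longer inlines lookups into such maps. */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(map_flags, BPF_F_INNER_MAP);
	__uint(max_entries, 1);
	__type(key, int);
	__type(value, int);
} inner_small SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(map_flags, BPF_F_INNER_MAP);
	__uint(max_entries, 128);
	__type(key, int);
	__type(value, int);
} inner_big SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
	__uint(max_entries, 2);
	__uint(key_size, sizeof(int));
	__uint(value_size, sizeof(int));
	/* Template used by libbpf for the outer map's inner-map metadata. */
	__array(values, struct {
		__uint(type, BPF_MAP_TYPE_ARRAY);
		__uint(map_flags, BPF_F_INNER_MAP);
		__uint(max_entries, 1);
		__type(key, int);
		__type(value, int);
	});
} outer SEC(".maps") = {
	.values = { [0] = (void *)&inner_small, [1] = (void *)&inner_big },
};
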
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 4d0ee78..ed7d02e 100644
             i < btf_type_vlen(struct_type);                            \
             i++, member++)
 
-#define for_each_vsi(i, struct_type, member)                   \
-       for (i = 0, member = btf_type_var_secinfo(struct_type); \
-            i < btf_type_vlen(struct_type);                    \
-            i++, member++)
-
 #define for_each_vsi_from(i, from, struct_type, member)                                \
        for (i = from, member = btf_type_var_secinfo(struct_type) + from;       \
             i < btf_type_vlen(struct_type);                                    \
@@ -440,16 +435,6 @@ static bool btf_type_nosize_or_null(const struct btf_type *t)
        return !t || btf_type_nosize(t);
 }
 
-/* union is only a special case of struct:
- * all its offsetof(member) == 0
- */
-static bool btf_type_is_struct(const struct btf_type *t)
-{
-       u8 kind = BTF_INFO_KIND(t->info);
-
-       return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION;
-}
-
 static bool __btf_type_is_struct(const struct btf_type *t)
 {
        return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT;
@@ -460,11 +445,6 @@ static bool btf_type_is_array(const struct btf_type *t)
        return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY;
 }
 
-static bool btf_type_is_var(const struct btf_type *t)
-{
-       return BTF_INFO_KIND(t->info) == BTF_KIND_VAR;
-}
-
 static bool btf_type_is_datasec(const struct btf_type *t)
 {
        return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
@@ -613,11 +593,6 @@ static const struct btf_var *btf_type_var(const struct btf_type *t)
        return (const struct btf_var *)(t + 1);
 }
 
-static const struct btf_var_secinfo *btf_type_var_secinfo(const struct btf_type *t)
-{
-       return (const struct btf_var_secinfo *)(t + 1);
-}
-
 static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
 {
        return kind_ops[BTF_INFO_KIND(t->info)];
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3395cf1..1815e97 100644
@@ -612,7 +612,7 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
  * bpf_prog
  *   __htab_map_lookup_elem
  */
-static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
 {
        struct bpf_insn *insn = insn_buf;
        const int ret = BPF_REG_0;
@@ -651,7 +651,7 @@ static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
        return __htab_lru_map_lookup_elem(map, key, false);
 }
 
-static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
+static int htab_lru_map_gen_lookup(struct bpf_map *map,
                                   struct bpf_insn *insn_buf)
 {
        struct bpf_insn *insn = insn_buf;
@@ -2070,7 +2070,7 @@ static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
        return READ_ONCE(*inner_map);
 }
 
-static u32 htab_of_map_gen_lookup(struct bpf_map *map,
+static int htab_of_map_gen_lookup(struct bpf_map *map,
                                  struct bpf_insn *insn_buf)
 {
        struct bpf_insn *insn = insn_buf;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index e825441..25520f5 100644
@@ -623,6 +623,34 @@ const struct bpf_func_proto bpf_copy_from_user_proto = {
        .arg3_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu)
+{
+       if (cpu >= nr_cpu_ids)
+               return (unsigned long)NULL;
+
+       return (unsigned long)per_cpu_ptr((const void __percpu *)ptr, cpu);
+}
+
+const struct bpf_func_proto bpf_per_cpu_ptr_proto = {
+       .func           = bpf_per_cpu_ptr,
+       .gpl_only       = false,
+       .ret_type       = RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL,
+       .arg1_type      = ARG_PTR_TO_PERCPU_BTF_ID,
+       .arg2_type      = ARG_ANYTHING,
+};
+
+BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr)
+{
+       return (unsigned long)this_cpu_ptr((const void __percpu *)percpu_ptr);
+}
+
+const struct bpf_func_proto bpf_this_cpu_ptr_proto = {
+       .func           = bpf_this_cpu_ptr,
+       .gpl_only       = false,
+       .ret_type       = RET_PTR_TO_MEM_OR_BTF_ID,
+       .arg1_type      = ARG_PTR_TO_PERCPU_BTF_ID,
+};
+
 const struct bpf_func_proto bpf_get_current_task_proto __weak;
 const struct bpf_func_proto bpf_probe_read_user_proto __weak;
 const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
@@ -689,6 +717,10 @@ bpf_base_func_proto(enum bpf_func_id func_id)
                return &bpf_snprintf_btf_proto;
        case BPF_FUNC_jiffies64:
                return &bpf_jiffies64_proto;
+       case BPF_FUNC_bpf_per_cpu_ptr:
+               return &bpf_per_cpu_ptr_proto;
+       case BPF_FUNC_bpf_this_cpu_ptr:
+               return &bpf_this_cpu_ptr_proto;
        default:
                break;
        }
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index b367430..3d897de 100644
@@ -17,6 +17,8 @@ int pcpu_freelist_init(struct pcpu_freelist *s)
                raw_spin_lock_init(&head->lock);
                head->first = NULL;
        }
+       raw_spin_lock_init(&s->extralist.lock);
+       s->extralist.first = NULL;
        return 0;
 }
 
@@ -40,12 +42,50 @@ static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
        raw_spin_unlock(&head->lock);
 }
 
+static inline bool pcpu_freelist_try_push_extra(struct pcpu_freelist *s,
+                                               struct pcpu_freelist_node *node)
+{
+       if (!raw_spin_trylock(&s->extralist.lock))
+               return false;
+
+       pcpu_freelist_push_node(&s->extralist, node);
+       raw_spin_unlock(&s->extralist.lock);
+       return true;
+}
+
+static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s,
+                                            struct pcpu_freelist_node *node)
+{
+       int cpu, orig_cpu;
+
+       orig_cpu = cpu = raw_smp_processor_id();
+       while (1) {
+               struct pcpu_freelist_head *head;
+
+               head = per_cpu_ptr(s->freelist, cpu);
+               if (raw_spin_trylock(&head->lock)) {
+                       pcpu_freelist_push_node(head, node);
+                       raw_spin_unlock(&head->lock);
+                       return;
+               }
+               cpu = cpumask_next(cpu, cpu_possible_mask);
+               if (cpu >= nr_cpu_ids)
+                       cpu = 0;
+
+               /* cannot lock any per cpu lock, try extralist */
+               if (cpu == orig_cpu &&
+                   pcpu_freelist_try_push_extra(s, node))
+                       return;
+       }
+}
+
 void __pcpu_freelist_push(struct pcpu_freelist *s,
                        struct pcpu_freelist_node *node)
 {
-       struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist);
-
-       ___pcpu_freelist_push(head, node);
+       if (in_nmi())
+               ___pcpu_freelist_push_nmi(s, node);
+       else
+               ___pcpu_freelist_push(this_cpu_ptr(s->freelist), node);
 }
 
 void pcpu_freelist_push(struct pcpu_freelist *s,
@@ -81,7 +121,7 @@ again:
        }
 }
 
-struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
+static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
 {
        struct pcpu_freelist_head *head;
        struct pcpu_freelist_node *node;
@@ -102,8 +142,59 @@ struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
                if (cpu >= nr_cpu_ids)
                        cpu = 0;
                if (cpu == orig_cpu)
-                       return NULL;
+                       break;
+       }
+
+       /* per cpu lists are all empty, try extralist */
+       raw_spin_lock(&s->extralist.lock);
+       node = s->extralist.first;
+       if (node)
+               s->extralist.first = node->next;
+       raw_spin_unlock(&s->extralist.lock);
+       return node;
+}
+
+static struct pcpu_freelist_node *
+___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
+{
+       struct pcpu_freelist_head *head;
+       struct pcpu_freelist_node *node;
+       int orig_cpu, cpu;
+
+       orig_cpu = cpu = raw_smp_processor_id();
+       while (1) {
+               head = per_cpu_ptr(s->freelist, cpu);
+               if (raw_spin_trylock(&head->lock)) {
+                       node = head->first;
+                       if (node) {
+                               head->first = node->next;
+                               raw_spin_unlock(&head->lock);
+                               return node;
+                       }
+                       raw_spin_unlock(&head->lock);
+               }
+               cpu = cpumask_next(cpu, cpu_possible_mask);
+               if (cpu >= nr_cpu_ids)
+                       cpu = 0;
+               if (cpu == orig_cpu)
+                       break;
        }
+
+       /* cannot pop from per cpu lists, try extralist */
+       if (!raw_spin_trylock(&s->extralist.lock))
+               return NULL;
+       node = s->extralist.first;
+       if (node)
+               s->extralist.first = node->next;
+       raw_spin_unlock(&s->extralist.lock);
+       return node;
+}
+
+struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
+{
+       if (in_nmi())
+               return ___pcpu_freelist_pop_nmi(s);
+       return ___pcpu_freelist_pop(s);
 }
 
 struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h
index fbf8a8a..3c76553 100644
@@ -13,6 +13,7 @@ struct pcpu_freelist_head {
 
 struct pcpu_freelist {
        struct pcpu_freelist_head __percpu *freelist;
+       struct pcpu_freelist_head extralist;
 };
 
 struct pcpu_freelist_node {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f1528c2..1110ecd 100644
@@ -4323,8 +4323,10 @@ static int bpf_prog_bind_map(union bpf_attr *attr)
        used_maps_old = prog->aux->used_maps;
 
        for (i = 0; i < prog->aux->used_map_cnt; i++)
-               if (used_maps_old[i] == map)
+               if (used_maps_old[i] == map) {
+                       bpf_map_put(map);
                        goto out_unlock;
+               }
 
        used_maps_new = kmalloc_array(prog->aux->used_map_cnt + 1,
                                      sizeof(used_maps_new[0]),
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3923c57..c43a5e8 100644
@@ -238,6 +238,8 @@ struct bpf_call_arg_meta {
        u64 msize_max_value;
        int ref_obj_id;
        int func_id;
+       u32 btf_id;
+       u32 ret_btf_id;
 };
 
 struct btf *btf_vmlinux;
@@ -517,6 +519,7 @@ static const char * const reg_type_str[] = {
        [PTR_TO_XDP_SOCK]       = "xdp_sock",
        [PTR_TO_BTF_ID]         = "ptr_",
        [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_",
+       [PTR_TO_PERCPU_BTF_ID]  = "percpu_ptr_",
        [PTR_TO_MEM]            = "mem",
        [PTR_TO_MEM_OR_NULL]    = "mem_or_null",
        [PTR_TO_RDONLY_BUF]     = "rdonly_buf",
@@ -583,7 +586,9 @@ static void print_verifier_state(struct bpf_verifier_env *env,
                        /* reg->off should be 0 for SCALAR_VALUE */
                        verbose(env, "%lld", reg->var_off.value + reg->off);
                } else {
-                       if (t == PTR_TO_BTF_ID || t == PTR_TO_BTF_ID_OR_NULL)
+                       if (t == PTR_TO_BTF_ID ||
+                           t == PTR_TO_BTF_ID_OR_NULL ||
+                           t == PTR_TO_PERCPU_BTF_ID)
                                verbose(env, "%s", kernel_type_name(reg->btf_id));
                        verbose(env, "(id=%d", reg->id);
                        if (reg_type_may_be_refcounted_or_null(t))
@@ -2204,6 +2209,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
        case PTR_TO_RDONLY_BUF_OR_NULL:
        case PTR_TO_RDWR_BUF:
        case PTR_TO_RDWR_BUF_OR_NULL:
+       case PTR_TO_PERCPU_BTF_ID:
                return true;
        default:
                return false;
@@ -2221,6 +2227,20 @@ static bool register_is_const(struct bpf_reg_state *reg)
        return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off);
 }
 
+static bool __is_scalar_unbounded(struct bpf_reg_state *reg)
+{
+       return tnum_is_unknown(reg->var_off) &&
+              reg->smin_value == S64_MIN && reg->smax_value == S64_MAX &&
+              reg->umin_value == 0 && reg->umax_value == U64_MAX &&
+              reg->s32_min_value == S32_MIN && reg->s32_max_value == S32_MAX &&
+              reg->u32_min_value == 0 && reg->u32_max_value == U32_MAX;
+}
+
+static bool register_is_bounded(struct bpf_reg_state *reg)
+{
+       return reg->type == SCALAR_VALUE && !__is_scalar_unbounded(reg);
+}
+
 static bool __is_pointer_value(bool allow_ptr_leaks,
                               const struct bpf_reg_state *reg)
 {
@@ -2272,7 +2292,7 @@ static int check_stack_write(struct bpf_verifier_env *env,
        if (value_regno >= 0)
                reg = &cur->regs[value_regno];
 
-       if (reg && size == BPF_REG_SIZE && register_is_const(reg) &&
+       if (reg && size == BPF_REG_SIZE && register_is_bounded(reg) &&
            !register_is_null(reg) && env->bpf_capable) {
                if (dst_reg != BPF_REG_FP) {
                        /* The backtracking logic can only recognize explicit
@@ -2667,7 +2687,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
        case BPF_PROG_TYPE_CGROUP_SKB:
                if (t == BPF_WRITE)
                        return false;
-               /* fallthrough */
+               fallthrough;
 
        /* Program types with direct read + write access go here! */
        case BPF_PROG_TYPE_SCHED_CLS:
@@ -3978,6 +3998,7 @@ static const struct bpf_reg_types sock_types = {
        },
 };
 
+#ifdef CONFIG_NET
 static const struct bpf_reg_types btf_id_sock_common_types = {
        .types = {
                PTR_TO_SOCK_COMMON,
@@ -3988,6 +4009,7 @@ static const struct bpf_reg_types btf_id_sock_common_types = {
        },
        .btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
 };
+#endif
 
 static const struct bpf_reg_types mem_types = {
        .types = {
@@ -4017,6 +4039,7 @@ static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM } };
 static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } };
 static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } };
 static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } };
+static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } };
 
 static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
        [ARG_PTR_TO_MAP_KEY]            = &map_key_value_types,
@@ -4030,7 +4053,9 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
        [ARG_PTR_TO_CTX]                = &context_types,
        [ARG_PTR_TO_CTX_OR_NULL]        = &context_types,
        [ARG_PTR_TO_SOCK_COMMON]        = &sock_types,
+#ifdef CONFIG_NET
        [ARG_PTR_TO_BTF_ID_SOCK_COMMON] = &btf_id_sock_common_types,
+#endif
        [ARG_PTR_TO_SOCKET]             = &fullsock_types,
        [ARG_PTR_TO_SOCKET_OR_NULL]     = &fullsock_types,
        [ARG_PTR_TO_BTF_ID]             = &btf_ptr_types,
@@ -4042,6 +4067,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
        [ARG_PTR_TO_ALLOC_MEM_OR_NULL]  = &alloc_mem_types,
        [ARG_PTR_TO_INT]                = &int_ptr_types,
        [ARG_PTR_TO_LONG]               = &int_ptr_types,
+       [ARG_PTR_TO_PERCPU_BTF_ID]      = &percpu_btf_ptr_types,
 };
 
 static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
@@ -4205,6 +4231,12 @@ skip_type_check:
                err = check_helper_mem_access(env, regno,
                                              meta->map_ptr->value_size, false,
                                              meta);
+       } else if (arg_type == ARG_PTR_TO_PERCPU_BTF_ID) {
+               if (!reg->btf_id) {
+                       verbose(env, "Helper has invalid btf_id in R%d\n", regno);
+                       return -EACCES;
+               }
+               meta->ret_btf_id = reg->btf_id;
        } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
                if (meta->func_id == BPF_FUNC_spin_lock) {
                        if (process_spin_lock(env, regno, true))
@@ -5114,6 +5146,35 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
                regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL;
                regs[BPF_REG_0].id = ++env->id_gen;
                regs[BPF_REG_0].mem_size = meta.mem_size;
+       } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL ||
+                  fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID) {
+               const struct btf_type *t;
+
+               mark_reg_known_zero(env, regs, BPF_REG_0);
+               t = btf_type_skip_modifiers(btf_vmlinux, meta.ret_btf_id, NULL);
+               if (!btf_type_is_struct(t)) {
+                       u32 tsize;
+                       const struct btf_type *ret;
+                       const char *tname;
+
+                       /* resolve the type size of ksym. */
+                       ret = btf_resolve_size(btf_vmlinux, t, &tsize);
+                       if (IS_ERR(ret)) {
+                               tname = btf_name_by_offset(btf_vmlinux, t->name_off);
+                               verbose(env, "unable to resolve the size of type '%s': %ld\n",
+                                       tname, PTR_ERR(ret));
+                               return -EINVAL;
+                       }
+                       regs[BPF_REG_0].type =
+                               fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ?
+                               PTR_TO_MEM : PTR_TO_MEM_OR_NULL;
+                       regs[BPF_REG_0].mem_size = tsize;
+               } else {
+                       regs[BPF_REG_0].type =
+                               fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ?
+                               PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL;
+                       regs[BPF_REG_0].btf_id = meta.ret_btf_id;
+               }
        } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) {
                int ret_btf_id;
 
@@ -5432,7 +5493,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
                /* smin_val represents the known value */
                if (known && smin_val == 0 && opcode == BPF_ADD)
                        break;
-               /* fall-through */
+               fallthrough;
        case PTR_TO_PACKET_END:
        case PTR_TO_SOCKET:
        case PTR_TO_SOCKET_OR_NULL:
@@ -6389,6 +6450,11 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
        src_reg = NULL;
        if (dst_reg->type != SCALAR_VALUE)
                ptr_reg = dst_reg;
+       else
+               /* Make sure ID is cleared otherwise dst_reg min/max could be
+                * incorrectly propagated into other registers by find_equal_scalars()
+                */
+               dst_reg->id = 0;
        if (BPF_SRC(insn->code) == BPF_X) {
                src_reg = &regs[insn->src_reg];
                if (src_reg->type != SCALAR_VALUE) {
@@ -6522,6 +6588,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
                                /* case: R1 = R2
                                 * copy register state to dest reg
                                 */
+                               if (src_reg->type == SCALAR_VALUE && !src_reg->id)
+                                       /* Assign src and dst registers the same ID
+                                        * that will be used by find_equal_scalars()
+                                        * to propagate min/max range.
+                                        */
+                                       src_reg->id = ++env->id_gen;
                                *dst_reg = *src_reg;
                                dst_reg->live |= REG_LIVE_WRITTEN;
                                dst_reg->subreg_def = DEF_NOT_SUBREG;
@@ -6534,6 +6606,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
                                        return -EACCES;
                                } else if (src_reg->type == SCALAR_VALUE) {
                                        *dst_reg = *src_reg;
+                                       /* Make sure ID is cleared otherwise
+                                        * dst_reg min/max could be incorrectly
+                                        * propagated into src_reg by find_equal_scalars()
+                                        */
+                                       dst_reg->id = 0;
                                        dst_reg->live |= REG_LIVE_WRITTEN;
                                        dst_reg->subreg_def = env->insn_idx + 1;
                                } else {
@@ -7322,6 +7399,30 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn,
        return true;
 }
 
+static void find_equal_scalars(struct bpf_verifier_state *vstate,
+                              struct bpf_reg_state *known_reg)
+{
+       struct bpf_func_state *state;
+       struct bpf_reg_state *reg;
+       int i, j;
+
+       for (i = 0; i <= vstate->curframe; i++) {
+               state = vstate->frame[i];
+               for (j = 0; j < MAX_BPF_REG; j++) {
+                       reg = &state->regs[j];
+                       if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
+                               *reg = *known_reg;
+               }
+
+               bpf_for_each_spilled_reg(j, state, reg) {
+                       if (!reg)
+                               continue;
+                       if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
+                               *reg = *known_reg;
+               }
+       }
+}
+
 static int check_cond_jmp_op(struct bpf_verifier_env *env,
                             struct bpf_insn *insn, int *insn_idx)
 {
@@ -7450,6 +7551,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
                                reg_combine_min_max(&other_branch_regs[insn->src_reg],
                                                    &other_branch_regs[insn->dst_reg],
                                                    src_reg, dst_reg, opcode);
+                       if (src_reg->id) {
+                               find_equal_scalars(this_branch, src_reg);
+                               find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]);
+                       }
+
                }
        } else if (dst_reg->type == SCALAR_VALUE) {
                reg_set_min_max(&other_branch_regs[insn->dst_reg],
@@ -7457,6 +7563,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
                                        opcode, is_jmp32);
        }
 
+       if (dst_reg->type == SCALAR_VALUE && dst_reg->id) {
+               find_equal_scalars(this_branch, dst_reg);
+               find_equal_scalars(other_branch, &other_branch_regs[insn->dst_reg]);
+       }
+
        /* detect if R == 0 where R is returned from bpf_map_lookup_elem().
         * NOTE: these optimizations below are related with pointer comparison
         *       which will never be JMP32.
@@ -7488,6 +7599,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
 {
        struct bpf_insn_aux_data *aux = cur_aux(env);
        struct bpf_reg_state *regs = cur_regs(env);
+       struct bpf_reg_state *dst_reg;
        struct bpf_map *map;
        int err;
 
@@ -7504,25 +7616,45 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
        if (err)
                return err;
 
+       dst_reg = &regs[insn->dst_reg];
        if (insn->src_reg == 0) {
                u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
 
-               regs[insn->dst_reg].type = SCALAR_VALUE;
+               dst_reg->type = SCALAR_VALUE;
                __mark_reg_known(&regs[insn->dst_reg], imm);
                return 0;
        }
 
+       if (insn->src_reg == BPF_PSEUDO_BTF_ID) {
+               mark_reg_known_zero(env, regs, insn->dst_reg);
+
+               dst_reg->type = aux->btf_var.reg_type;
+               switch (dst_reg->type) {
+               case PTR_TO_MEM:
+                       dst_reg->mem_size = aux->btf_var.mem_size;
+                       break;
+               case PTR_TO_BTF_ID:
+               case PTR_TO_PERCPU_BTF_ID:
+                       dst_reg->btf_id = aux->btf_var.btf_id;
+                       break;
+               default:
+                       verbose(env, "bpf verifier is misconfigured\n");
+                       return -EFAULT;
+               }
+               return 0;
+       }
+
        map = env->used_maps[aux->map_index];
        mark_reg_known_zero(env, regs, insn->dst_reg);
-       regs[insn->dst_reg].map_ptr = map;
+       dst_reg->map_ptr = map;
 
        if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) {
-               regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
-               regs[insn->dst_reg].off = aux->map_off;
+               dst_reg->type = PTR_TO_MAP_VALUE;
+               dst_reg->off = aux->map_off;
                if (map_value_has_spin_lock(map))
-                       regs[insn->dst_reg].id = ++env->id_gen;
+                       dst_reg->id = ++env->id_gen;
        } else if (insn->src_reg == BPF_PSEUDO_MAP_FD) {
-               regs[insn->dst_reg].type = CONST_PTR_TO_MAP;
+               dst_reg->type = CONST_PTR_TO_MAP;
        } else {
                verbose(env, "bpf verifier is misconfigured\n");
                return -EINVAL;
@@ -9424,6 +9556,92 @@ process_bpf_exit:
        return 0;
 }
 
+/* replace pseudo btf_id with kernel symbol address */
+static int check_pseudo_btf_id(struct bpf_verifier_env *env,
+                              struct bpf_insn *insn,
+                              struct bpf_insn_aux_data *aux)
+{
+       u32 datasec_id, type, id = insn->imm;
+       const struct btf_var_secinfo *vsi;
+       const struct btf_type *datasec;
+       const struct btf_type *t;
+       const char *sym_name;
+       bool percpu = false;
+       u64 addr;
+       int i;
+
+       if (!btf_vmlinux) {
+               verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n");
+               return -EINVAL;
+       }
+
+       if (insn[1].imm != 0) {
+               verbose(env, "reserved field (insn[1].imm) is used in pseudo_btf_id ldimm64 insn.\n");
+               return -EINVAL;
+       }
+
+       t = btf_type_by_id(btf_vmlinux, id);
+       if (!t) {
+               verbose(env, "ldimm64 insn specifies invalid btf_id %d.\n", id);
+               return -ENOENT;
+       }
+
+       if (!btf_type_is_var(t)) {
+               verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n",
+                       id);
+               return -EINVAL;
+       }
+
+       sym_name = btf_name_by_offset(btf_vmlinux, t->name_off);
+       addr = kallsyms_lookup_name(sym_name);
+       if (!addr) {
+               verbose(env, "ldimm64 failed to find the address for kernel symbol '%s'.\n",
+                       sym_name);
+               return -ENOENT;
+       }
+
+       datasec_id = btf_find_by_name_kind(btf_vmlinux, ".data..percpu",
+                                          BTF_KIND_DATASEC);
+       if (datasec_id > 0) {
+               datasec = btf_type_by_id(btf_vmlinux, datasec_id);
+               for_each_vsi(i, datasec, vsi) {
+                       if (vsi->type == id) {
+                               percpu = true;
+                               break;
+                       }
+               }
+       }
+
+       insn[0].imm = (u32)addr;
+       insn[1].imm = addr >> 32;
+
+       type = t->type;
+       t = btf_type_skip_modifiers(btf_vmlinux, type, NULL);
+       if (percpu) {
+               aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID;
+               aux->btf_var.btf_id = type;
+       } else if (!btf_type_is_struct(t)) {
+               const struct btf_type *ret;
+               const char *tname;
+               u32 tsize;
+
+               /* resolve the type size of ksym. */
+               ret = btf_resolve_size(btf_vmlinux, t, &tsize);
+               if (IS_ERR(ret)) {
+                       tname = btf_name_by_offset(btf_vmlinux, t->name_off);
+                       verbose(env, "ldimm64 unable to resolve the size of type '%s': %ld\n",
+                               tname, PTR_ERR(ret));
+                       return -EINVAL;
+               }
+               aux->btf_var.reg_type = PTR_TO_MEM;
+               aux->btf_var.mem_size = tsize;
+       } else {
+               aux->btf_var.reg_type = PTR_TO_BTF_ID;
+               aux->btf_var.btf_id = type;
+       }
+       return 0;
+}
+
 static int check_map_prealloc(struct bpf_map *map)
 {
        return (map->map_type != BPF_MAP_TYPE_HASH &&
@@ -9534,10 +9752,14 @@ static bool bpf_map_is_cgroup_storage(struct bpf_map *map)
                map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE);
 }
 
-/* look for pseudo eBPF instructions that access map FDs and
- * replace them with actual map pointers
+/* find and rewrite pseudo imm in ld_imm64 instructions:
+ *
+ * 1. if it accesses map FD, replace it with actual map pointer.
+ * 2. if it accesses btf_id of a VAR, replace it with pointer to the var.
+ *
+ * NOTE: btf_vmlinux is required for converting pseudo btf_id.
  */
-static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
+static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
 {
        struct bpf_insn *insn = env->prog->insnsi;
        int insn_cnt = env->prog->len;
@@ -9578,6 +9800,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
                                /* valid generic load 64-bit imm */
                                goto next_insn;
 
+                       if (insn[0].src_reg == BPF_PSEUDO_BTF_ID) {
+                               aux = &env->insn_aux_data[i];
+                               err = check_pseudo_btf_id(env, insn, aux);
+                               if (err)
+                                       return err;
+                               goto next_insn;
+                       }
+
                        /* In final convert_pseudo_ld_imm64() step, this is
                         * converted into regular 64-bit imm load insn.
                         */
@@ -10819,7 +11049,9 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
                        if (insn->imm == BPF_FUNC_map_lookup_elem &&
                            ops->map_gen_lookup) {
                                cnt = ops->map_gen_lookup(map_ptr, insn_buf);
-                               if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
+                               if (cnt == -EOPNOTSUPP)
+                                       goto patch_map_ops_generic;
+                               if (cnt <= 0 || cnt >= ARRAY_SIZE(insn_buf)) {
                                        verbose(env, "bpf verifier is misconfigured\n");
                                        return -EINVAL;
                                }
@@ -10849,7 +11081,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
                                     (int (*)(struct bpf_map *map, void *value))NULL));
                        BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
                                     (int (*)(struct bpf_map *map, void *value))NULL));
-
+patch_map_ops_generic:
                        switch (insn->imm) {
                        case BPF_FUNC_map_lookup_elem:
                                insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) -
@@ -11633,10 +11865,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
        if (is_priv)
                env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
 
-       ret = replace_map_fd_with_map_ptr(env);
-       if (ret < 0)
-               goto skip_full_check;
-
        if (bpf_prog_is_dev_bound(env->prog->aux)) {
                ret = bpf_prog_offload_verifier_prep(env->prog);
                if (ret)
@@ -11662,6 +11890,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
        if (ret)
                goto skip_full_check;
 
+       ret = resolve_pseudo_ldimm64(env);
+       if (ret < 0)
+               goto skip_full_check;
+
        ret = check_cfg(env);
        if (ret < 0)
                goto skip_full_check;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index e118a83..a2a4535 100644
@@ -1327,6 +1327,10 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
                return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL;
        case BPF_FUNC_snprintf_btf:
                return &bpf_snprintf_btf_proto;
+       case BPF_FUNC_bpf_per_cpu_ptr:
+               return &bpf_per_cpu_ptr_proto;
+       case BPF_FUNC_bpf_this_cpu_ptr:
+               return &bpf_this_cpu_ptr_proto;
        default:
                return NULL;
        }
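
With bpf_per_cpu_ptr() and bpf_this_cpu_ptr() wired into the tracing helper table above, a tracing program can take a pointer into a per-CPU kernel variable declared as a typed ksym. A minimal sketch, not part of the diff, assuming a vmlinux.h generated from kernel BTF and libbpf's __ksym convention; the runqueues variable and section name are illustrative and follow the selftest style:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

extern const struct rq runqueues __ksym;

SEC("raw_tp/sys_enter")
int peek_rq(const void *ctx)
{
        u32 cpu = bpf_get_smp_processor_id();
        struct rq *rq;

        /* pointer into another CPU's copy of the variable; may be NULL */
        rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, cpu);
        if (!rq)
                return 0;

        /* pointer into this CPU's copy; never NULL, so no check needed */
        rq = (struct rq *)bpf_this_cpu_ptr(&runqueues);
        return 0;
}

char LICENSE[] SEC("license") = "GPL";
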
@@ -1776,7 +1780,9 @@ const struct bpf_verifier_ops raw_tracepoint_verifier_ops = {
 };
 
 const struct bpf_prog_ops raw_tracepoint_prog_ops = {
+#ifdef CONFIG_NET
        .test_run = bpf_prog_test_run_raw_tp,
+#endif
 };
 
 const struct bpf_verifier_ops tracing_verifier_ops = {
index a146bac..22babf5 100644 (file)
@@ -4930,7 +4930,7 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
 
 static inline struct sk_buff *
 sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
-                  struct net_device *orig_dev)
+                  struct net_device *orig_dev, bool *another)
 {
 #ifdef CONFIG_NET_CLS_ACT
        struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
@@ -4974,7 +4974,11 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
                 * redirecting to another netdev
                 */
                __skb_push(skb, skb->mac_len);
-               skb_do_redirect(skb);
+               if (skb_do_redirect(skb) == -EAGAIN) {
+                       __skb_pull(skb, skb->mac_len);
+                       *another = true;
+                       break;
+               }
                return NULL;
        case TC_ACT_CONSUMED:
                return NULL;
@@ -5163,7 +5167,12 @@ another_round:
 skip_taps:
 #ifdef CONFIG_NET_INGRESS
        if (static_branch_unlikely(&ingress_needed_key)) {
-               skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
+               bool another = false;
+
+               skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
+                                        &another);
+               if (another)
+                       goto another_round;
                if (!skb)
                        goto out;
 
index bc6bd2b..c5e2a1c 100644 (file)
@@ -76,6 +76,7 @@
 #include <net/bpf_sk_storage.h>
 #include <net/transp_v6.h>
 #include <linux/btf_ids.h>
+#include <net/tls.h>
 
 static const struct bpf_func_proto *
 bpf_sk_base_func_proto(enum bpf_func_id func_id);
@@ -2379,8 +2380,9 @@ out:
 
 /* Internal, non-exposed redirect flags. */
 enum {
-       BPF_F_NEIGH = (1ULL << 1),
-#define BPF_F_REDIRECT_INTERNAL        (BPF_F_NEIGH)
+       BPF_F_NEIGH     = (1ULL << 1),
+       BPF_F_PEER      = (1ULL << 2),
+#define BPF_F_REDIRECT_INTERNAL        (BPF_F_NEIGH | BPF_F_PEER)
 };
 
 BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
@@ -2429,19 +2431,35 @@ EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info);
 int skb_do_redirect(struct sk_buff *skb)
 {
        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+       struct net *net = dev_net(skb->dev);
        struct net_device *dev;
        u32 flags = ri->flags;
 
-       dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index);
+       dev = dev_get_by_index_rcu(net, ri->tgt_index);
        ri->tgt_index = 0;
-       if (unlikely(!dev)) {
-               kfree_skb(skb);
-               return -EINVAL;
+       ri->flags = 0;
+       if (unlikely(!dev))
+               goto out_drop;
+       if (flags & BPF_F_PEER) {
+               const struct net_device_ops *ops = dev->netdev_ops;
+
+               if (unlikely(!ops->ndo_get_peer_dev ||
+                            !skb_at_tc_ingress(skb)))
+                       goto out_drop;
+               dev = ops->ndo_get_peer_dev(dev);
+               if (unlikely(!dev ||
+                            !is_skb_forwardable(dev, skb) ||
+                            net_eq(net, dev_net(dev))))
+                       goto out_drop;
+               skb->dev = dev;
+               return -EAGAIN;
        }
-
        return flags & BPF_F_NEIGH ?
               __bpf_redirect_neigh(skb, dev) :
               __bpf_redirect(skb, dev, flags);
+out_drop:
+       kfree_skb(skb);
+       return -EINVAL;
 }
 
 BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
@@ -2465,6 +2483,27 @@ static const struct bpf_func_proto bpf_redirect_proto = {
        .arg2_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags)
+{
+       struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+
+       if (unlikely(flags))
+               return TC_ACT_SHOT;
+
+       ri->flags = BPF_F_PEER;
+       ri->tgt_index = ifindex;
+
+       return TC_ACT_REDIRECT;
+}
+
+static const struct bpf_func_proto bpf_redirect_peer_proto = {
+       .func           = bpf_redirect_peer,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_ANYTHING,
+       .arg2_type      = ARG_ANYTHING,
+};
+
 BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags)
 {
        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
@@ -3479,6 +3518,48 @@ static u32 __bpf_skb_max_len(const struct sk_buff *skb)
                          SKB_MAX_ALLOC;
 }
 
+BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
+          u32, mode, u64, flags)
+{
+       u32 len_diff_abs = abs(len_diff);
+       bool shrink = len_diff < 0;
+       int ret = 0;
+
+       if (unlikely(flags || mode))
+               return -EINVAL;
+       if (unlikely(len_diff_abs > 0xfffU))
+               return -EFAULT;
+
+       if (!shrink) {
+               ret = skb_cow(skb, len_diff);
+               if (unlikely(ret < 0))
+                       return ret;
+               __skb_push(skb, len_diff_abs);
+               memset(skb->data, 0, len_diff_abs);
+       } else {
+               if (unlikely(!pskb_may_pull(skb, len_diff_abs)))
+                       return -ENOMEM;
+               __skb_pull(skb, len_diff_abs);
+       }
+       bpf_compute_data_end_sk_skb(skb);
+       if (tls_sw_has_ctx_rx(skb->sk)) {
+               struct strp_msg *rxm = strp_msg(skb);
+
+               rxm->full_len += len_diff;
+       }
+       return ret;
+}
+
+static const struct bpf_func_proto sk_skb_adjust_room_proto = {
+       .func           = sk_skb_adjust_room,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_ANYTHING,
+       .arg3_type      = ARG_ANYTHING,
+       .arg4_type      = ARG_ANYTHING,
+};
+
 BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
           u32, mode, u64, flags)
 {
@@ -4784,6 +4865,10 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname,
                                else
                                        icsk->icsk_user_timeout = val;
                                break;
+                       case TCP_NOTSENT_LOWAT:
+                               tp->notsent_lowat = val;
+                               sk->sk_write_space(sk);
+                               break;
                        default:
                                ret = -EINVAL;
                        }
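
With TCP_NOTSENT_LOWAT now accepted by bpf_setsockopt(), a sockops program can set the unsent-data threshold per socket. A minimal sketch, not part of the diff; the 128 KiB value and the callback chosen are illustrative, and SOL_TCP is defined locally in case the included headers do not provide it:

#include <linux/bpf.h>
#include <linux/tcp.h>          /* TCP_NOTSENT_LOWAT */
#include <bpf/bpf_helpers.h>

#ifndef SOL_TCP
#define SOL_TCP 6
#endif

SEC("sockops")
int set_notsent_lowat(struct bpf_sock_ops *skops)
{
        int lowat = 128 * 1024;

        /* apply the threshold once the active connection is established */
        if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB)
                bpf_setsockopt(skops, SOL_TCP, TCP_NOTSENT_LOWAT,
                               &lowat, sizeof(lowat));
        return 1;
}

char LICENSE[] SEC("license") = "GPL";
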
@@ -5149,7 +5234,6 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
        memcpy(params->smac, dev->dev_addr, ETH_ALEN);
        params->h_vlan_TCI = 0;
        params->h_vlan_proto = 0;
-       params->ifindex = dev->ifindex;
 
        return 0;
 }
@@ -5246,6 +5330,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
        dev = nhc->nhc_dev;
 
        params->rt_metric = res.fi->fib_priority;
+       params->ifindex = dev->ifindex;
 
        /* xdp and cls_bpf programs are run in RCU-bh so
         * rcu_read_lock_bh is not needed here
@@ -5371,6 +5456,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 
        dev = res.nh->fib_nh_dev;
        params->rt_metric = res.f6i->fib6_metric;
+       params->ifindex = dev->ifindex;
 
        /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
         * not needed here.
@@ -6745,6 +6831,7 @@ bool bpf_helper_changes_pkt_data(void *func)
            func == bpf_skb_change_tail ||
            func == sk_skb_change_tail ||
            func == bpf_skb_adjust_room ||
+           func == sk_skb_adjust_room ||
            func == bpf_skb_pull_data ||
            func == sk_skb_pull_data ||
            func == bpf_clone_redirect ||
@@ -7005,6 +7092,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
                return &bpf_redirect_proto;
        case BPF_FUNC_redirect_neigh:
                return &bpf_redirect_neigh_proto;
+       case BPF_FUNC_redirect_peer:
+               return &bpf_redirect_peer_proto;
        case BPF_FUNC_get_route_realm:
                return &bpf_get_route_realm_proto;
        case BPF_FUNC_get_hash_recalc:
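
bpf_redirect_peer() is meant for tc ingress on devices that expose a netns peer via ndo_get_peer_dev (veth in this series); together with the dev.c change above, the skb re-enters ingress on the peer directly instead of taking another per-CPU backlog round trip. A minimal sketch, not part of the diff; IFINDEX_PEER_SIDE is a placeholder for the local veth ifindex whose peer lives in the other netns:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define IFINDEX_PEER_SIDE 4     /* placeholder ifindex, patched per setup */

SEC("classifier")
int tc_redirect_to_peer(struct __sk_buff *skb)
{
        /* flags must be 0; the skb continues at ingress of the netns peer
         * of the device with ifindex IFINDEX_PEER_SIDE
         */
        return bpf_redirect_peer(IFINDEX_PEER_SIDE, 0);
}

char LICENSE[] SEC("license") = "GPL";
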
@@ -7218,6 +7307,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
                return &sk_skb_change_tail_proto;
        case BPF_FUNC_skb_change_head:
                return &sk_skb_change_head_proto;
+       case BPF_FUNC_skb_adjust_room:
+               return &sk_skb_adjust_room_proto;
        case BPF_FUNC_get_socket_cookie:
                return &bpf_get_socket_cookie_proto;
        case BPF_FUNC_get_socket_uid:
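
For SK_SKB programs the existing bpf_skb_adjust_room() helper now maps to sk_skb_adjust_room() above, with mode and flags required to be 0. A minimal sketch, not part of the diff; the 8-byte framing header being stripped is purely illustrative:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("sk_skb/stream_verdict")
int strip_frame_hdr(struct __sk_buff *skb)
{
        /* shrink 8 bytes at the head of the skb; mode and flags must be 0 */
        if (bpf_skb_adjust_room(skb, -8, 0, 0))
                return SK_DROP;
        return SK_PASS;
}

char LICENSE[] SEC("license") = "GPL";
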
index 4b5f7c8..654182e 100644 (file)
@@ -433,10 +433,12 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
 static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
                               u32 off, u32 len, bool ingress)
 {
-       if (ingress)
-               return sk_psock_skb_ingress(psock, skb);
-       else
+       if (!ingress) {
+               if (!sock_writeable(psock->sk))
+                       return -EAGAIN;
                return skb_send_sock_locked(psock->sk, skb, off, len);
+       }
+       return sk_psock_skb_ingress(psock, skb);
 }
 
 static void sk_psock_backlog(struct work_struct *work)
@@ -625,6 +627,8 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
        rcu_assign_sk_user_data(sk, NULL);
        if (psock->progs.skb_parser)
                sk_psock_stop_strp(sk, psock);
+       else if (psock->progs.skb_verdict)
+               sk_psock_stop_verdict(sk, psock);
        write_unlock_bh(&sk->sk_callback_lock);
        sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
 
@@ -682,19 +686,8 @@ EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
 static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog,
                            struct sk_buff *skb)
 {
-       int ret;
-
-       skb->sk = psock->sk;
        bpf_compute_data_end_sk_skb(skb);
-       ret = bpf_prog_run_pin_on_cpu(prog, skb);
-       /* strparser clones the skb before handing it to a upper layer,
-        * meaning skb_orphan has been called. We NULL sk on the way out
-        * to ensure we don't trigger a BUG_ON() in skb/sk operations
-        * later and because we are not charging the memory of this skb
-        * to any socket yet.
-        */
-       skb->sk = NULL;
-       return ret;
+       return bpf_prog_run_pin_on_cpu(prog, skb);
 }
 
 static struct sk_psock *sk_psock_from_strp(struct strparser *strp)
@@ -709,38 +702,35 @@ static void sk_psock_skb_redirect(struct sk_buff *skb)
 {
        struct sk_psock *psock_other;
        struct sock *sk_other;
-       bool ingress;
 
        sk_other = tcp_skb_bpf_redirect_fetch(skb);
+       /* This error indicates a buggy BPF program: it returned a
+        * redirect verdict but then didn't set a redirect socket.
+        */
        if (unlikely(!sk_other)) {
                kfree_skb(skb);
                return;
        }
        psock_other = sk_psock(sk_other);
+       /* This error indicates the socket is being torn down or had another
+        * error that caused the pipe to break. We can't send a packet on
+        * a socket that is in this state so we drop the skb.
+        */
        if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
            !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
                kfree_skb(skb);
                return;
        }
 
-       ingress = tcp_skb_bpf_ingress(skb);
-       if ((!ingress && sock_writeable(sk_other)) ||
-           (ingress &&
-            atomic_read(&sk_other->sk_rmem_alloc) <=
-            sk_other->sk_rcvbuf)) {
-               if (!ingress)
-                       skb_set_owner_w(skb, sk_other);
-               skb_queue_tail(&psock_other->ingress_skb, skb);
-               schedule_work(&psock_other->work);
-       } else {
-               kfree_skb(skb);
-       }
+       skb_queue_tail(&psock_other->ingress_skb, skb);
+       schedule_work(&psock_other->work);
 }
 
-static void sk_psock_tls_verdict_apply(struct sk_buff *skb, int verdict)
+static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict)
 {
        switch (verdict) {
        case __SK_REDIRECT:
+               skb_set_owner_r(skb, sk);
                sk_psock_skb_redirect(skb);
                break;
        case __SK_PASS:
@@ -758,11 +748,17 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb)
        rcu_read_lock();
        prog = READ_ONCE(psock->progs.skb_verdict);
        if (likely(prog)) {
+               /* We skip the full skb_set_owner_r() here because for a
+                * SK_PASS or SK_DROP verdict we can skip skb memory
+                * accounting and use the TLS context instead.
+                */
+               skb->sk = psock->sk;
                tcp_skb_bpf_redirect_clear(skb);
                ret = sk_psock_bpf_run(psock, prog, skb);
                ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+               skb->sk = NULL;
        }
-       sk_psock_tls_verdict_apply(skb, ret);
+       sk_psock_tls_verdict_apply(skb, psock->sk, ret);
        rcu_read_unlock();
        return ret;
 }
@@ -771,7 +767,9 @@ EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read);
 static void sk_psock_verdict_apply(struct sk_psock *psock,
                                   struct sk_buff *skb, int verdict)
 {
+       struct tcp_skb_cb *tcp;
        struct sock *sk_other;
+       int err = -EIO;
 
        switch (verdict) {
        case __SK_PASS:
@@ -780,16 +778,24 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
                    !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
                        goto out_free;
                }
-               if (atomic_read(&sk_other->sk_rmem_alloc) <=
-                   sk_other->sk_rcvbuf) {
-                       struct tcp_skb_cb *tcp = TCP_SKB_CB(skb);
 
-                       tcp->bpf.flags |= BPF_F_INGRESS;
+               tcp = TCP_SKB_CB(skb);
+               tcp->bpf.flags |= BPF_F_INGRESS;
+
+               /* If the queue is empty we can submit directly
+                * into the msg queue. If it is not empty we have to
+                * queue work, otherwise we may get out-of-order data.
+                * Any error from sk_psock_skb_ingress() is handled by
+                * retrying later from the workqueue.
+                */
+               if (skb_queue_empty(&psock->ingress_skb)) {
+                       err = sk_psock_skb_ingress(psock, skb);
+               }
+               if (err < 0) {
                        skb_queue_tail(&psock->ingress_skb, skb);
                        schedule_work(&psock->work);
-                       break;
                }
-               goto out_free;
+               break;
        case __SK_REDIRECT:
                sk_psock_skb_redirect(skb);
                break;
@@ -814,9 +820,9 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
                kfree_skb(skb);
                goto out;
        }
+       skb_set_owner_r(skb, sk);
        prog = READ_ONCE(psock->progs.skb_verdict);
        if (likely(prog)) {
-               skb_orphan(skb);
                tcp_skb_bpf_redirect_clear(skb);
                ret = sk_psock_bpf_run(psock, prog, skb);
                ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
@@ -839,8 +845,11 @@ static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)
 
        rcu_read_lock();
        prog = READ_ONCE(psock->progs.skb_parser);
-       if (likely(prog))
+       if (likely(prog)) {
+               skb->sk = psock->sk;
                ret = sk_psock_bpf_run(psock, prog, skb);
+               skb->sk = NULL;
+       }
        rcu_read_unlock();
        return ret;
 }
@@ -864,6 +873,57 @@ static void sk_psock_strp_data_ready(struct sock *sk)
        rcu_read_unlock();
 }
 
+static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
+                                unsigned int offset, size_t orig_len)
+{
+       struct sock *sk = (struct sock *)desc->arg.data;
+       struct sk_psock *psock;
+       struct bpf_prog *prog;
+       int ret = __SK_DROP;
+       int len = skb->len;
+
+       /* clone here so sk_eat_skb() in tcp_read_sock does not drop our data */
+       skb = skb_clone(skb, GFP_ATOMIC);
+       if (!skb) {
+               desc->error = -ENOMEM;
+               return 0;
+       }
+
+       rcu_read_lock();
+       psock = sk_psock(sk);
+       if (unlikely(!psock)) {
+               len = 0;
+               kfree_skb(skb);
+               goto out;
+       }
+       skb_set_owner_r(skb, sk);
+       prog = READ_ONCE(psock->progs.skb_verdict);
+       if (likely(prog)) {
+               tcp_skb_bpf_redirect_clear(skb);
+               ret = sk_psock_bpf_run(psock, prog, skb);
+               ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+       }
+       sk_psock_verdict_apply(psock, skb, ret);
+out:
+       rcu_read_unlock();
+       return len;
+}
+
+static void sk_psock_verdict_data_ready(struct sock *sk)
+{
+       struct socket *sock = sk->sk_socket;
+       read_descriptor_t desc;
+
+       if (unlikely(!sock || !sock->ops || !sock->ops->read_sock))
+               return;
+
+       desc.arg.data = sk;
+       desc.error = 0;
+       desc.count = 1;
+
+       sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv);
+}
+
 static void sk_psock_write_space(struct sock *sk)
 {
        struct sk_psock *psock;
@@ -893,6 +953,19 @@ int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
        return strp_init(&psock->parser.strp, sk, &cb);
 }
 
+void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock)
+{
+       struct sk_psock_parser *parser = &psock->parser;
+
+       if (parser->enabled)
+               return;
+
+       parser->saved_data_ready = sk->sk_data_ready;
+       sk->sk_data_ready = sk_psock_verdict_data_ready;
+       sk->sk_write_space = sk_psock_write_space;
+       parser->enabled = true;
+}
+
 void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
 {
        struct sk_psock_parser *parser = &psock->parser;
@@ -918,3 +991,15 @@ void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
        strp_stop(&parser->strp);
        parser->enabled = false;
 }
+
+void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock)
+{
+       struct sk_psock_parser *parser = &psock->parser;
+
+       if (!parser->enabled)
+               return;
+
+       sk->sk_data_ready = parser->saved_data_ready;
+       parser->saved_data_ready = NULL;
+       parser->enabled = false;
+}
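
The verdict-only receive path added above (sk_psock_verdict_data_ready() driving sk_psock_verdict_recv() through the socket's ->read_sock()) lets a sockmap user run with just a verdict program and no strparser. A minimal sketch of such a program, not part of the diff; the map name, sizes and key are illustrative:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
        __uint(type, BPF_MAP_TYPE_SOCKMAP);
        __uint(max_entries, 2);
        __uint(key_size, sizeof(int));
        __uint(value_size, sizeof(int));
} sock_map SEC(".maps");

SEC("sk_skb/stream_verdict")
int verdict_only(struct __sk_buff *skb)
{
        /* redirect every skb to the socket stored at key 0 of sock_map */
        return bpf_sk_redirect_map(skb, &sock_map, 0, 0);
}

char LICENSE[] SEC("license") = "GPL";
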
index e83a80e..df09c39 100644 (file)
@@ -148,8 +148,8 @@ static void sock_map_add_link(struct sk_psock *psock,
 static void sock_map_del_link(struct sock *sk,
                              struct sk_psock *psock, void *link_raw)
 {
+       bool strp_stop = false, verdict_stop = false;
        struct sk_psock_link *link, *tmp;
-       bool strp_stop = false;
 
        spin_lock_bh(&psock->link_lock);
        list_for_each_entry_safe(link, tmp, &psock->link, list) {
@@ -159,14 +159,19 @@ static void sock_map_del_link(struct sock *sk,
                                                             map);
                        if (psock->parser.enabled && stab->progs.skb_parser)
                                strp_stop = true;
+                       if (psock->parser.enabled && stab->progs.skb_verdict)
+                               verdict_stop = true;
                        list_del(&link->list);
                        sk_psock_free_link(link);
                }
        }
        spin_unlock_bh(&psock->link_lock);
-       if (strp_stop) {
+       if (strp_stop || verdict_stop) {
                write_lock_bh(&sk->sk_callback_lock);
-               sk_psock_stop_strp(sk, psock);
+               if (strp_stop)
+                       sk_psock_stop_strp(sk, psock);
+               else
+                       sk_psock_stop_verdict(sk, psock);
                write_unlock_bh(&sk->sk_callback_lock);
        }
 }
@@ -230,16 +235,16 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
 {
        struct bpf_prog *msg_parser, *skb_parser, *skb_verdict;
        struct sk_psock *psock;
-       bool skb_progs;
        int ret;
 
        skb_verdict = READ_ONCE(progs->skb_verdict);
        skb_parser = READ_ONCE(progs->skb_parser);
-       skb_progs = skb_parser && skb_verdict;
-       if (skb_progs) {
+       if (skb_verdict) {
                skb_verdict = bpf_prog_inc_not_zero(skb_verdict);
                if (IS_ERR(skb_verdict))
                        return PTR_ERR(skb_verdict);
+       }
+       if (skb_parser) {
                skb_parser = bpf_prog_inc_not_zero(skb_parser);
                if (IS_ERR(skb_parser)) {
                        bpf_prog_put(skb_verdict);
@@ -264,7 +269,8 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
 
        if (psock) {
                if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) ||
-                   (skb_progs  && READ_ONCE(psock->progs.skb_parser))) {
+                   (skb_parser  && READ_ONCE(psock->progs.skb_parser)) ||
+                   (skb_verdict && READ_ONCE(psock->progs.skb_verdict))) {
                        sk_psock_put(sk, psock);
                        ret = -EBUSY;
                        goto out_progs;
@@ -285,28 +291,31 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
                goto out_drop;
 
        write_lock_bh(&sk->sk_callback_lock);
-       if (skb_progs && !psock->parser.enabled) {
+       if (skb_parser && skb_verdict && !psock->parser.enabled) {
                ret = sk_psock_init_strp(sk, psock);
-               if (ret) {
-                       write_unlock_bh(&sk->sk_callback_lock);
-                       goto out_drop;
-               }
+               if (ret)
+                       goto out_unlock_drop;
                psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
                psock_set_prog(&psock->progs.skb_parser, skb_parser);
                sk_psock_start_strp(sk, psock);
+       } else if (!skb_parser && skb_verdict && !psock->parser.enabled) {
+               psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
+               sk_psock_start_verdict(sk, psock);
        }
        write_unlock_bh(&sk->sk_callback_lock);
        return 0;
+out_unlock_drop:
+       write_unlock_bh(&sk->sk_callback_lock);
 out_drop:
        sk_psock_put(sk, psock);
 out_progs:
        if (msg_parser)
                bpf_prog_put(msg_parser);
 out:
-       if (skb_progs) {
+       if (skb_verdict)
                bpf_prog_put(skb_verdict);
+       if (skb_parser)
                bpf_prog_put(skb_parser);
-       }
        return ret;
 }
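
On the user-space side, the relaxed sock_map_link() above means attaching only a stream verdict program to a sockmap is now accepted; a parser is needed only when both programs are supplied. A minimal sketch, not part of the diff; verdict_fd and map_fd are placeholders for an already-loaded SK_SKB program and a sockmap:

#include <bpf/bpf.h>

/* verdict_fd: loaded BPF_PROG_TYPE_SK_SKB program, map_fd: BPF_MAP_TYPE_SOCKMAP.
 * No BPF_SK_SKB_STREAM_PARSER attachment is required anymore.
 */
static int attach_verdict_only(int verdict_fd, int map_fd)
{
        return bpf_prog_attach(verdict_fd, map_fd, BPF_SK_SKB_STREAM_VERDICT, 0);
}
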
 
index 56c306e..495dda2 100644 (file)
@@ -548,7 +548,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
        newtp->fastopen_req = NULL;
        RCU_INIT_POINTER(newtp->fastopen_rsk, NULL);
 
-       bpf_skops_init_child(sk, newsk);
        tcp_bpf_clone(sk, newsk);
 
        __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
index e63fadd..64c9e55 100644 (file)
@@ -3,9 +3,6 @@
 #include <net/xsk_buff_pool.h>
 #include <net/xdp_sock.h>
 #include <net/xdp_sock_drv.h>
-#include <linux/dma-direct.h>
-#include <linux/dma-noncoherent.h>
-#include <linux/swiotlb.h>
 
 #include "xsk_queue.h"
 #include "xdp_umem.h"
index dc1dd5e..cdb9cf3 100644 (file)
 
 struct xdp_ring {
        u32 producer ____cacheline_aligned_in_smp;
+       /* Keep the adjacent cache line prefetcher from pulling in the
+        * consumer pointer when the producer pointer is touched, and vice versa.
+        */
+       u32 pad ____cacheline_aligned_in_smp;
        u32 consumer ____cacheline_aligned_in_smp;
        u32 flags;
 };
index 0c5df59..49da2b8 100644 (file)
@@ -132,7 +132,7 @@ static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
        return 0;
 }
 
-static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
 {
        const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2;
        struct bpf_insn *insn = insn_buf;
index 4f1ed0e..aeebf5d 100644 (file)
@@ -98,8 +98,8 @@ test_map_in_map-objs := test_map_in_map_user.o
 per_socket_stats_example-objs := cookie_uid_helper_example.o
 xdp_redirect-objs := xdp_redirect_user.o
 xdp_redirect_map-objs := xdp_redirect_map_user.o
-xdp_redirect_cpu-objs := bpf_load.o xdp_redirect_cpu_user.o
-xdp_monitor-objs := bpf_load.o xdp_monitor_user.o
+xdp_redirect_cpu-objs := xdp_redirect_cpu_user.o
+xdp_monitor-objs := xdp_monitor_user.o
 xdp_rxq_info-objs := xdp_rxq_info_user.o
 syscall_tp-objs := syscall_tp_user.o
 cpustat-objs := cpustat_user.o
@@ -211,6 +211,8 @@ TPROGLDLIBS_xsk_fwd         += -pthread
 #  make M=samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
 LLC ?= llc
 CLANG ?= clang
+OPT ?= opt
+LLVM_DIS ?= llvm-dis
 LLVM_OBJCOPY ?= llvm-objcopy
 BTF_PAHOLE ?= pahole
 
@@ -303,6 +305,11 @@ $(obj)/hbm_edt_kern.o: $(src)/hbm.h $(src)/hbm_kern.h
 # asm/sysreg.h - inline assembly used by it is incompatible with llvm.
 # But, there is no easy way to fix it, so just exclude it since it is
 # useless for BPF samples.
+# Below we use a long chain of commands, clang | opt | llvm-dis | llc,
+# to generate the final object file. 'clang' compiles the source into IR
+# for the native target, e.g., x64, arm64, etc. 'opt' does BPF CO-RE IR
+# builtin processing (llvm12) and IR optimizations. 'llvm-dis' converts the
+# 'opt' bitcode output back to textual IR, and finally 'llc' generates BPF byte code.
 $(obj)/%.o: $(src)/%.c
        @echo "  CLANG-bpf " $@
        $(Q)$(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(BPF_EXTRA_CFLAGS) \
@@ -314,7 +321,9 @@ $(obj)/%.o: $(src)/%.c
                -Wno-address-of-packed-member -Wno-tautological-compare \
                -Wno-unknown-warning-option $(CLANG_ARCH_ARGS) \
                -I$(srctree)/samples/bpf/ -include asm_goto_workaround.h \
-               -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf $(LLC_FLAGS) -filetype=obj -o $@
+               -O2 -emit-llvm -Xclang -disable-llvm-passes -c $< -o - | \
+               $(OPT) -O2 -mtriple=bpf-pc-linux | $(LLVM_DIS) | \
+               $(LLC) -march=bpf $(LLC_FLAGS) -filetype=obj -o $@
 ifeq ($(DWARF2BTF),y)
        $(BTF_PAHOLE) -J $@
 endif
index 4b22ace..ff4c533 100644 (file)
@@ -40,6 +40,7 @@
 #include <errno.h>
 #include <fcntl.h>
 #include <linux/unistd.h>
+#include <linux/compiler.h>
 
 #include <linux/bpf.h>
 #include <bpf/bpf.h>
@@ -483,7 +484,7 @@ int main(int argc, char **argv)
                                        "Option -%c requires an argument.\n\n",
                                        optopt);
                case 'h':
-                       fallthrough;
+                       __fallthrough;
                default:
                        Usage();
                        return 0;
index 3d33cca..5c955b8 100644 (file)
@@ -6,21 +6,21 @@
 #include <uapi/linux/bpf.h>
 #include <bpf/bpf_helpers.h>
 
-struct bpf_map_def SEC("maps") redirect_err_cnt = {
-       .type = BPF_MAP_TYPE_PERCPU_ARRAY,
-       .key_size = sizeof(u32),
-       .value_size = sizeof(u64),
-       .max_entries = 2,
+struct {
+       __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+       __type(key, u32);
+       __type(value, u64);
+       __uint(max_entries, 2);
        /* TODO: have entries for all possible errno's */
-};
+} redirect_err_cnt SEC(".maps");
 
 #define XDP_UNKNOWN    XDP_REDIRECT + 1
-struct bpf_map_def SEC("maps") exception_cnt = {
-       .type           = BPF_MAP_TYPE_PERCPU_ARRAY,
-       .key_size       = sizeof(u32),
-       .value_size     = sizeof(u64),
-       .max_entries    = XDP_UNKNOWN + 1,
-};
+struct {
+       __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+       __type(key, u32);
+       __type(value, u64);
+       __uint(max_entries, XDP_UNKNOWN + 1);
+} exception_cnt SEC(".maps");
 
 /* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format
  * Code in:                kernel/include/trace/events/xdp.h
@@ -129,19 +129,19 @@ struct datarec {
 };
 #define MAX_CPUS 64
 
-struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = {
-       .type           = BPF_MAP_TYPE_PERCPU_ARRAY,
-       .key_size       = sizeof(u32),
-       .value_size     = sizeof(struct datarec),
-       .max_entries    = MAX_CPUS,
-};
+struct {
+       __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+       __type(key, u32);
+       __type(value, struct datarec);
+       __uint(max_entries, MAX_CPUS);
+} cpumap_enqueue_cnt SEC(".maps");
 
-struct bpf_map_def SEC("maps") cpumap_kthread_cnt = {
-       .type           = BPF_MAP_TYPE_PERCPU_ARRAY,
-       .key_size       = sizeof(u32),
-       .value_size     = sizeof(struct datarec),
-       .max_entries    = 1,
-};
+struct {
+       __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+       __type(key, u32);
+       __type(value, struct datarec);
+       __uint(max_entries, 1);
+} cpumap_kthread_cnt SEC(".maps");
 
 /* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format
  * Code in:         kernel/include/trace/events/xdp.h
@@ -210,12 +210,12 @@ int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
        return 0;
 }
 
-struct bpf_map_def SEC("maps") devmap_xmit_cnt = {
-       .type           = BPF_MAP_TYPE_PERCPU_ARRAY,
-       .key_size       = sizeof(u32),
-       .value_size     = sizeof(struct datarec),
-       .max_entries    = 1,
-};
+struct {
+       __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+       __type(key, u32);
+       __type(value, struct datarec);
+       __uint(max_entries, 1);
+} devmap_xmit_cnt SEC(".maps");
 
 /* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_devmap_xmit/format
  * Code in:         kernel/include/trace/events/xdp.h
index ef53b93..03d0a18 100644 (file)
@@ -26,12 +26,37 @@ static const char *__doc_err_only__=
 #include <net/if.h>
 #include <time.h>
 
+#include <signal.h>
 #include <bpf/bpf.h>
-#include "bpf_load.h"
+#include <bpf/libbpf.h>
 #include "bpf_util.h"
 
+enum map_type {
+       REDIRECT_ERR_CNT,
+       EXCEPTION_CNT,
+       CPUMAP_ENQUEUE_CNT,
+       CPUMAP_KTHREAD_CNT,
+       DEVMAP_XMIT_CNT,
+};
+
+static const char *const map_type_strings[] = {
+       [REDIRECT_ERR_CNT] = "redirect_err_cnt",
+       [EXCEPTION_CNT] = "exception_cnt",
+       [CPUMAP_ENQUEUE_CNT] = "cpumap_enqueue_cnt",
+       [CPUMAP_KTHREAD_CNT] = "cpumap_kthread_cnt",
+       [DEVMAP_XMIT_CNT] = "devmap_xmit_cnt",
+};
+
+#define NUM_MAP 5
+#define NUM_TP 8
+
+static int tp_cnt;
+static int map_cnt;
 static int verbose = 1;
 static bool debug = false;
+struct bpf_map *map_data[NUM_MAP] = {};
+struct bpf_link *tp_links[NUM_TP] = {};
+struct bpf_object *obj;
 
 static const struct option long_options[] = {
        {"help",        no_argument,            NULL, 'h' },
@@ -41,6 +66,16 @@ static const struct option long_options[] = {
        {0, 0, NULL,  0 }
 };
 
+static void int_exit(int sig)
+{
+       /* Detach tracepoints */
+       while (tp_cnt)
+               bpf_link__destroy(tp_links[--tp_cnt]);
+
+       bpf_object__close(obj);
+       exit(0);
+}
+
 /* C standard specifies two constants, EXIT_SUCCESS(0) and EXIT_FAILURE(1) */
 #define EXIT_FAIL_MEM  5
 
@@ -483,23 +518,23 @@ static bool stats_collect(struct stats_record *rec)
         * this can happen by someone running perf-record -e
         */
 
-       fd = map_data[0].fd; /* map0: redirect_err_cnt */
+       fd = bpf_map__fd(map_data[REDIRECT_ERR_CNT]);
        for (i = 0; i < REDIR_RES_MAX; i++)
                map_collect_record_u64(fd, i, &rec->xdp_redirect[i]);
 
-       fd = map_data[1].fd; /* map1: exception_cnt */
+       fd = bpf_map__fd(map_data[EXCEPTION_CNT]);
        for (i = 0; i < XDP_ACTION_MAX; i++) {
                map_collect_record_u64(fd, i, &rec->xdp_exception[i]);
        }
 
-       fd = map_data[2].fd; /* map2: cpumap_enqueue_cnt */
+       fd = bpf_map__fd(map_data[CPUMAP_ENQUEUE_CNT]);
        for (i = 0; i < MAX_CPUS; i++)
                map_collect_record(fd, i, &rec->xdp_cpumap_enqueue[i]);
 
-       fd = map_data[3].fd; /* map3: cpumap_kthread_cnt */
+       fd = bpf_map__fd(map_data[CPUMAP_KTHREAD_CNT]);
        map_collect_record(fd, 0, &rec->xdp_cpumap_kthread);
 
-       fd = map_data[4].fd; /* map4: devmap_xmit_cnt */
+       fd = bpf_map__fd(map_data[DEVMAP_XMIT_CNT]);
        map_collect_record(fd, 0, &rec->xdp_devmap_xmit);
 
        return true;
@@ -598,8 +633,8 @@ static void stats_poll(int interval, bool err_only)
 
        /* TODO Need more advanced stats on error types */
        if (verbose) {
-               printf(" - Stats map0: %s\n", map_data[0].name);
-               printf(" - Stats map1: %s\n", map_data[1].name);
+               printf(" - Stats map0: %s\n", bpf_map__name(map_data[0]));
+               printf(" - Stats map1: %s\n", bpf_map__name(map_data[1]));
                printf("\n");
        }
        fflush(stdout);
@@ -618,44 +653,51 @@ static void stats_poll(int interval, bool err_only)
 
 static void print_bpf_prog_info(void)
 {
-       int i;
+       struct bpf_program *prog;
+       struct bpf_map *map;
+       int i = 0;
 
        /* Prog info */
-       printf("Loaded BPF prog have %d bpf program(s)\n", prog_cnt);
-       for (i = 0; i < prog_cnt; i++) {
-               printf(" - prog_fd[%d] = fd(%d)\n", i, prog_fd[i]);
+       printf("Loaded BPF prog have %d bpf program(s)\n", tp_cnt);
+       bpf_object__for_each_program(prog, obj) {
+               printf(" - prog_fd[%d] = fd(%d)\n", i, bpf_program__fd(prog));
+               i++;
        }
 
+       i = 0;
        /* Maps info */
-       printf("Loaded BPF prog have %d map(s)\n", map_data_count);
-       for (i = 0; i < map_data_count; i++) {
-               char *name = map_data[i].name;
-               int fd     = map_data[i].fd;
+       printf("Loaded BPF prog have %d map(s)\n", map_cnt);
+       bpf_object__for_each_map(map, obj) {
+               const char *name = bpf_map__name(map);
+               int fd           = bpf_map__fd(map);
 
                printf(" - map_data[%d] = fd(%d) name:%s\n", i, fd, name);
+               i++;
        }
 
        /* Event info */
-       printf("Searching for (max:%d) event file descriptor(s)\n", prog_cnt);
-       for (i = 0; i < prog_cnt; i++) {
-               if (event_fd[i] != -1)
-                       printf(" - event_fd[%d] = fd(%d)\n", i, event_fd[i]);
+       printf("Searching for (max:%d) event file descriptor(s)\n", tp_cnt);
+       for (i = 0; i < tp_cnt; i++) {
+               int fd = bpf_link__fd(tp_links[i]);
+
+               if (fd != -1)
+                       printf(" - event_fd[%d] = fd(%d)\n", i, fd);
        }
 }
 
 int main(int argc, char **argv)
 {
        struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+       struct bpf_program *prog;
        int longindex = 0, opt;
-       int ret = EXIT_SUCCESS;
-       char bpf_obj_file[256];
+       int ret = EXIT_FAILURE;
+       enum map_type type;
+       char filename[256];
 
        /* Default settings: */
        bool errors_only = true;
        int interval = 2;
 
-       snprintf(bpf_obj_file, sizeof(bpf_obj_file), "%s_kern.o", argv[0]);
-
        /* Parse commands line args */
        while ((opt = getopt_long(argc, argv, "hDSs:",
                                  long_options, &longindex)) != -1) {
@@ -672,40 +714,79 @@ int main(int argc, char **argv)
                case 'h':
                default:
                        usage(argv);
-                       return EXIT_FAILURE;
+                       return ret;
                }
        }
 
+       snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
        if (setrlimit(RLIMIT_MEMLOCK, &r)) {
                perror("setrlimit(RLIMIT_MEMLOCK)");
-               return EXIT_FAILURE;
+               return ret;
        }
 
-       if (load_bpf_file(bpf_obj_file)) {
-               printf("ERROR - bpf_log_buf: %s", bpf_log_buf);
-               return EXIT_FAILURE;
+       /* Remove tracepoint program when program is interrupted or killed */
+       signal(SIGINT, int_exit);
+       signal(SIGTERM, int_exit);
+
+       obj = bpf_object__open_file(filename, NULL);
+       if (libbpf_get_error(obj)) {
+               printf("ERROR: opening BPF object file failed\n");
+               obj = NULL;
+               goto cleanup;
+       }
+
+       /* load BPF program */
+       if (bpf_object__load(obj)) {
+               printf("ERROR: loading BPF object file failed\n");
+               goto cleanup;
+       }
+
+       for (type = 0; type < NUM_MAP; type++) {
+               map_data[type] =
+                       bpf_object__find_map_by_name(obj, map_type_strings[type]);
+
+               if (libbpf_get_error(map_data[type])) {
+                       printf("ERROR: finding a map in obj file failed\n");
+                       goto cleanup;
+               }
+               map_cnt++;
        }
-       if (!prog_fd[0]) {
-               printf("ERROR - load_bpf_file: %s\n", strerror(errno));
-               return EXIT_FAILURE;
+
+       bpf_object__for_each_program(prog, obj) {
+               tp_links[tp_cnt] = bpf_program__attach(prog);
+               if (libbpf_get_error(tp_links[tp_cnt])) {
+                       printf("ERROR: bpf_program__attach failed\n");
+                       tp_links[tp_cnt] = NULL;
+                       goto cleanup;
+               }
+               tp_cnt++;
        }
 
        if (debug) {
                print_bpf_prog_info();
        }
 
-       /* Unload/stop tracepoint event by closing fd's */
+       /* Unload/stop tracepoint event by closing bpf_link's */
        if (errors_only) {
-               /* The prog_fd[i] and event_fd[i] depend on the
-                * order the functions was defined in _kern.c
+               /* The tp_links[i] depend on the order in which
+                * the functions are defined in _kern.c
                 */
-               close(event_fd[2]); /* tracepoint/xdp/xdp_redirect */
-               close(prog_fd[2]);  /* func: trace_xdp_redirect */
-               close(event_fd[3]); /* tracepoint/xdp/xdp_redirect_map */
-               close(prog_fd[3]);  /* func: trace_xdp_redirect_map */
+               bpf_link__destroy(tp_links[2]); /* tracepoint/xdp/xdp_redirect */
+               tp_links[2] = NULL;
+
+               bpf_link__destroy(tp_links[3]); /* tracepoint/xdp/xdp_redirect_map */
+               tp_links[3] = NULL;
        }
 
        stats_poll(interval, errors_only);
 
+       ret = EXIT_SUCCESS;
+
+cleanup:
+       /* Detach tracepoints */
+       while (tp_cnt)
+               bpf_link__destroy(tp_links[--tp_cnt]);
+
+       bpf_object__close(obj);
        return ret;
 }
index 3dd366e..6fb8dbd 100644 (file)
@@ -37,18 +37,35 @@ static __u32 prog_id;
 
 static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
 static int n_cpus;
-static int cpu_map_fd;
-static int rx_cnt_map_fd;
-static int redirect_err_cnt_map_fd;
-static int cpumap_enqueue_cnt_map_fd;
-static int cpumap_kthread_cnt_map_fd;
-static int cpus_available_map_fd;
-static int cpus_count_map_fd;
-static int cpus_iterator_map_fd;
-static int exception_cnt_map_fd;
+
+enum map_type {
+       CPU_MAP,
+       RX_CNT,
+       REDIRECT_ERR_CNT,
+       CPUMAP_ENQUEUE_CNT,
+       CPUMAP_KTHREAD_CNT,
+       CPUS_AVAILABLE,
+       CPUS_COUNT,
+       CPUS_ITERATOR,
+       EXCEPTION_CNT,
+};
+
+static const char *const map_type_strings[] = {
+       [CPU_MAP] = "cpu_map",
+       [RX_CNT] = "rx_cnt",
+       [REDIRECT_ERR_CNT] = "redirect_err_cnt",
+       [CPUMAP_ENQUEUE_CNT] = "cpumap_enqueue_cnt",
+       [CPUMAP_KTHREAD_CNT] = "cpumap_kthread_cnt",
+       [CPUS_AVAILABLE] = "cpus_available",
+       [CPUS_COUNT] = "cpus_count",
+       [CPUS_ITERATOR] = "cpus_iterator",
+       [EXCEPTION_CNT] = "exception_cnt",
+};
 
 #define NUM_TP 5
-struct bpf_link *tp_links[NUM_TP] = { 0 };
+#define NUM_MAP 9
+struct bpf_link *tp_links[NUM_TP] = {};
+static int map_fds[NUM_MAP];
 static int tp_cnt = 0;
 
 /* Exit return codes */
@@ -527,20 +544,20 @@ static void stats_collect(struct stats_record *rec)
 {
        int fd, i;
 
-       fd = rx_cnt_map_fd;
+       fd = map_fds[RX_CNT];
        map_collect_percpu(fd, 0, &rec->rx_cnt);
 
-       fd = redirect_err_cnt_map_fd;
+       fd = map_fds[REDIRECT_ERR_CNT];
        map_collect_percpu(fd, 1, &rec->redir_err);
 
-       fd = cpumap_enqueue_cnt_map_fd;
+       fd = map_fds[CPUMAP_ENQUEUE_CNT];
        for (i = 0; i < n_cpus; i++)
                map_collect_percpu(fd, i, &rec->enq[i]);
 
-       fd = cpumap_kthread_cnt_map_fd;
+       fd = map_fds[CPUMAP_KTHREAD_CNT];
        map_collect_percpu(fd, 0, &rec->kthread);
 
-       fd = exception_cnt_map_fd;
+       fd = map_fds[EXCEPTION_CNT];
        map_collect_percpu(fd, 0, &rec->exception);
 }
 
@@ -565,7 +582,7 @@ static int create_cpu_entry(__u32 cpu, struct bpf_cpumap_val *value,
        /* Add a CPU entry to cpumap, as this allocate a cpu entry in
         * the kernel for the cpu.
         */
-       ret = bpf_map_update_elem(cpu_map_fd, &cpu, value, 0);
+       ret = bpf_map_update_elem(map_fds[CPU_MAP], &cpu, value, 0);
        if (ret) {
                fprintf(stderr, "Create CPU entry failed (err:%d)\n", ret);
                exit(EXIT_FAIL_BPF);
@@ -574,21 +591,21 @@ static int create_cpu_entry(__u32 cpu, struct bpf_cpumap_val *value,
        /* Inform bpf_prog's that a new CPU is available to select
         * from via some control maps.
         */
-       ret = bpf_map_update_elem(cpus_available_map_fd, &avail_idx, &cpu, 0);
+       ret = bpf_map_update_elem(map_fds[CPUS_AVAILABLE], &avail_idx, &cpu, 0);
        if (ret) {
                fprintf(stderr, "Add to avail CPUs failed\n");
                exit(EXIT_FAIL_BPF);
        }
 
        /* When not replacing/updating existing entry, bump the count */
-       ret = bpf_map_lookup_elem(cpus_count_map_fd, &key, &curr_cpus_count);
+       ret = bpf_map_lookup_elem(map_fds[CPUS_COUNT], &key, &curr_cpus_count);
        if (ret) {
                fprintf(stderr, "Failed reading curr cpus_count\n");
                exit(EXIT_FAIL_BPF);
        }
        if (new) {
                curr_cpus_count++;
-               ret = bpf_map_update_elem(cpus_count_map_fd, &key,
+               ret = bpf_map_update_elem(map_fds[CPUS_COUNT], &key,
                                          &curr_cpus_count, 0);
                if (ret) {
                        fprintf(stderr, "Failed write curr cpus_count\n");
@@ -612,7 +629,7 @@ static void mark_cpus_unavailable(void)
        int ret, i;
 
        for (i = 0; i < n_cpus; i++) {
-               ret = bpf_map_update_elem(cpus_available_map_fd, &i,
+               ret = bpf_map_update_elem(map_fds[CPUS_AVAILABLE], &i,
                                          &invalid_cpu, 0);
                if (ret) {
                        fprintf(stderr, "Failed marking CPU unavailable\n");
@@ -665,68 +682,37 @@ static void stats_poll(int interval, bool use_separators, char *prog_name,
        free_stats_record(prev);
 }
 
-static struct bpf_link * attach_tp(struct bpf_object *obj,
-                                  const char *tp_category,
-                                  const char* tp_name)
+static int init_tracepoints(struct bpf_object *obj)
 {
        struct bpf_program *prog;
-       struct bpf_link *link;
-       char sec_name[PATH_MAX];
-       int len;
 
-       len = snprintf(sec_name, PATH_MAX, "tracepoint/%s/%s",
-                      tp_category, tp_name);
-       if (len < 0)
-               exit(EXIT_FAIL);
+       bpf_object__for_each_program(prog, obj) {
+               if (!bpf_program__is_tracepoint(prog))
+                       continue;
 
-       prog = bpf_object__find_program_by_title(obj, sec_name);
-       if (!prog) {
-               fprintf(stderr, "ERR: finding progsec: %s\n", sec_name);
-               exit(EXIT_FAIL_BPF);
+               tp_links[tp_cnt] = bpf_program__attach(prog);
+               if (libbpf_get_error(tp_links[tp_cnt])) {
+                       tp_links[tp_cnt] = NULL;
+                       return -EINVAL;
+               }
+               tp_cnt++;
        }
 
-       link = bpf_program__attach_tracepoint(prog, tp_category, tp_name);
-       if (libbpf_get_error(link))
-               exit(EXIT_FAIL_BPF);
-
-       return link;
-}
-
-static void init_tracepoints(struct bpf_object *obj) {
-       tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_redirect_err");
-       tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_redirect_map_err");
-       tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_exception");
-       tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_cpumap_enqueue");
-       tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_cpumap_kthread");
+       return 0;
 }
 
 static int init_map_fds(struct bpf_object *obj)
 {
-       /* Maps updated by tracepoints */
-       redirect_err_cnt_map_fd =
-               bpf_object__find_map_fd_by_name(obj, "redirect_err_cnt");
-       exception_cnt_map_fd =
-               bpf_object__find_map_fd_by_name(obj, "exception_cnt");
-       cpumap_enqueue_cnt_map_fd =
-               bpf_object__find_map_fd_by_name(obj, "cpumap_enqueue_cnt");
-       cpumap_kthread_cnt_map_fd =
-               bpf_object__find_map_fd_by_name(obj, "cpumap_kthread_cnt");
-
-       /* Maps used by XDP */
-       rx_cnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rx_cnt");
-       cpu_map_fd = bpf_object__find_map_fd_by_name(obj, "cpu_map");
-       cpus_available_map_fd =
-               bpf_object__find_map_fd_by_name(obj, "cpus_available");
-       cpus_count_map_fd = bpf_object__find_map_fd_by_name(obj, "cpus_count");
-       cpus_iterator_map_fd =
-               bpf_object__find_map_fd_by_name(obj, "cpus_iterator");
-
-       if (cpu_map_fd < 0 || rx_cnt_map_fd < 0 ||
-           redirect_err_cnt_map_fd < 0 || cpumap_enqueue_cnt_map_fd < 0 ||
-           cpumap_kthread_cnt_map_fd < 0 || cpus_available_map_fd < 0 ||
-           cpus_count_map_fd < 0 || cpus_iterator_map_fd < 0 ||
-           exception_cnt_map_fd < 0)
-               return -ENOENT;
+       enum map_type type;
+
+       for (type = 0; type < NUM_MAP; type++) {
+               map_fds[type] =
+                       bpf_object__find_map_fd_by_name(obj,
+                                                       map_type_strings[type]);
+
+               if (map_fds[type] < 0)
+                       return -ENOENT;
+       }
 
        return 0;
 }
@@ -795,13 +781,13 @@ int main(int argc, char **argv)
        bool stress_mode = false;
        struct bpf_program *prog;
        struct bpf_object *obj;
+       int err = EXIT_FAIL;
        char filename[256];
        int added_cpus = 0;
        int longindex = 0;
        int interval = 2;
        int add_cpu = -1;
-       int opt, err;
-       int prog_fd;
+       int opt, prog_fd;
        int *cpu, i;
        __u32 qsize;
 
@@ -824,24 +810,29 @@ int main(int argc, char **argv)
        }
 
        if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
-               return EXIT_FAIL;
+               return err;
 
        if (prog_fd < 0) {
                fprintf(stderr, "ERR: bpf_prog_load_xattr: %s\n",
                        strerror(errno));
-               return EXIT_FAIL;
+               return err;
        }
-       init_tracepoints(obj);
+
+       if (init_tracepoints(obj) < 0) {
+               fprintf(stderr, "ERR: bpf_program__attach failed\n");
+               return err;
+       }
+
        if (init_map_fds(obj) < 0) {
                fprintf(stderr, "bpf_object__find_map_fd_by_name failed\n");
-               return EXIT_FAIL;
+               return err;
        }
        mark_cpus_unavailable();
 
        cpu = malloc(n_cpus * sizeof(int));
        if (!cpu) {
                fprintf(stderr, "failed to allocate cpu array\n");
-               return EXIT_FAIL;
+               return err;
        }
        memset(cpu, 0, n_cpus * sizeof(int));
 
@@ -960,14 +951,12 @@ int main(int argc, char **argv)
        prog = bpf_object__find_program_by_title(obj, prog_name);
        if (!prog) {
                fprintf(stderr, "bpf_object__find_program_by_title failed\n");
-               err = EXIT_FAIL;
                goto out;
        }
 
        prog_fd = bpf_program__fd(prog);
        if (prog_fd < 0) {
                fprintf(stderr, "bpf_program__fd failed\n");
-               err = EXIT_FAIL;
                goto out;
        }
 
@@ -986,6 +975,8 @@ int main(int argc, char **argv)
 
        stats_poll(interval, use_separators, prog_name, mprog_name,
                   &value, stress_mode);
+
+       err = EXIT_OK;
 out:
        free(cpu);
        return err;
index 3337728..9cf76b3 100644 (file)
@@ -5,14 +5,12 @@
 #include <bpf/bpf_helpers.h>
 
 #define SAMPLE_SIZE 64ul
-#define MAX_CPUS 128
-
-struct bpf_map_def SEC("maps") my_map = {
-       .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
-       .key_size = sizeof(int),
-       .value_size = sizeof(u32),
-       .max_entries = MAX_CPUS,
-};
+
+struct {
+       __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+       __uint(key_size, sizeof(int));
+       __uint(value_size, sizeof(u32));
+} my_map SEC(".maps");
 
 SEC("xdp_sample")
 int xdp_sample_prog(struct xdp_md *ctx)
index 991ef6f..4b2a300 100644 (file)
@@ -18,7 +18,6 @@
 
 #include "perf-sys.h"
 
-#define MAX_CPUS 128
 static int if_idx;
 static char *if_name;
 static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
index b220173..1149e94 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/if_xdp.h>
 #include <linux/if_ether.h>
 #include <linux/ip.h>
+#include <linux/limits.h>
 #include <linux/udp.h>
 #include <arpa/inet.h>
 #include <locale.h>
@@ -79,6 +80,10 @@ static u16 opt_pkt_size = MIN_PKT_SIZE;
 static u32 opt_pkt_fill_pattern = 0x12345678;
 static bool opt_extra_stats;
 static bool opt_quiet;
+static bool opt_app_stats;
+static const char *opt_irq_str = "";
+static u32 irq_no;
+static int irqs_at_init = -1;
 static int opt_poll;
 static int opt_interval = 1;
 static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP;
@@ -91,18 +96,7 @@ static bool opt_need_wakeup = true;
 static u32 opt_num_xsks = 1;
 static u32 prog_id;
 
-struct xsk_umem_info {
-       struct xsk_ring_prod fq;
-       struct xsk_ring_cons cq;
-       struct xsk_umem *umem;
-       void *buffer;
-};
-
-struct xsk_socket_info {
-       struct xsk_ring_cons rx;
-       struct xsk_ring_prod tx;
-       struct xsk_umem_info *umem;
-       struct xsk_socket *xsk;
+struct xsk_ring_stats {
        unsigned long rx_npkts;
        unsigned long tx_npkts;
        unsigned long rx_dropped_npkts;
@@ -119,6 +113,41 @@ struct xsk_socket_info {
        unsigned long prev_rx_full_npkts;
        unsigned long prev_rx_fill_empty_npkts;
        unsigned long prev_tx_empty_npkts;
+};
+
+struct xsk_driver_stats {
+       unsigned long intrs;
+       unsigned long prev_intrs;
+};
+
+struct xsk_app_stats {
+       unsigned long rx_empty_polls;
+       unsigned long fill_fail_polls;
+       unsigned long copy_tx_sendtos;
+       unsigned long tx_wakeup_sendtos;
+       unsigned long opt_polls;
+       unsigned long prev_rx_empty_polls;
+       unsigned long prev_fill_fail_polls;
+       unsigned long prev_copy_tx_sendtos;
+       unsigned long prev_tx_wakeup_sendtos;
+       unsigned long prev_opt_polls;
+};
+
+struct xsk_umem_info {
+       struct xsk_ring_prod fq;
+       struct xsk_ring_cons cq;
+       struct xsk_umem *umem;
+       void *buffer;
+};
+
+struct xsk_socket_info {
+       struct xsk_ring_cons rx;
+       struct xsk_ring_prod tx;
+       struct xsk_umem_info *umem;
+       struct xsk_socket *xsk;
+       struct xsk_ring_stats ring_stats;
+       struct xsk_app_stats app_stats;
+       struct xsk_driver_stats drv_stats;
        u32 outstanding_tx;
 };
 
@@ -173,18 +202,151 @@ static int xsk_get_xdp_stats(int fd, struct xsk_socket_info *xsk)
                return err;
 
        if (optlen == sizeof(struct xdp_statistics)) {
-               xsk->rx_dropped_npkts = stats.rx_dropped;
-               xsk->rx_invalid_npkts = stats.rx_invalid_descs;
-               xsk->tx_invalid_npkts = stats.tx_invalid_descs;
-               xsk->rx_full_npkts = stats.rx_ring_full;
-               xsk->rx_fill_empty_npkts = stats.rx_fill_ring_empty_descs;
-               xsk->tx_empty_npkts = stats.tx_ring_empty_descs;
+               xsk->ring_stats.rx_dropped_npkts = stats.rx_dropped;
+               xsk->ring_stats.rx_invalid_npkts = stats.rx_invalid_descs;
+               xsk->ring_stats.tx_invalid_npkts = stats.tx_invalid_descs;
+               xsk->ring_stats.rx_full_npkts = stats.rx_ring_full;
+               xsk->ring_stats.rx_fill_empty_npkts = stats.rx_fill_ring_empty_descs;
+               xsk->ring_stats.tx_empty_npkts = stats.tx_ring_empty_descs;
                return 0;
        }
 
        return -EINVAL;
 }
 
+static void dump_app_stats(long dt)
+{
+       int i;
+
+       for (i = 0; i < num_socks && xsks[i]; i++) {
+               char *fmt = "%-18s %'-14.0f %'-14lu\n";
+               double rx_empty_polls_ps, fill_fail_polls_ps, copy_tx_sendtos_ps,
+                               tx_wakeup_sendtos_ps, opt_polls_ps;
+
+               rx_empty_polls_ps = (xsks[i]->app_stats.rx_empty_polls -
+                                       xsks[i]->app_stats.prev_rx_empty_polls) * 1000000000. / dt;
+               fill_fail_polls_ps = (xsks[i]->app_stats.fill_fail_polls -
+                                       xsks[i]->app_stats.prev_fill_fail_polls) * 1000000000. / dt;
+               copy_tx_sendtos_ps = (xsks[i]->app_stats.copy_tx_sendtos -
+                                       xsks[i]->app_stats.prev_copy_tx_sendtos) * 1000000000. / dt;
+               tx_wakeup_sendtos_ps = (xsks[i]->app_stats.tx_wakeup_sendtos -
+                                       xsks[i]->app_stats.prev_tx_wakeup_sendtos)
+                                                                               * 1000000000. / dt;
+               opt_polls_ps = (xsks[i]->app_stats.opt_polls -
+                                       xsks[i]->app_stats.prev_opt_polls) * 1000000000. / dt;
+
+               printf("\n%-18s %-14s %-14s\n", "", "calls/s", "count");
+               printf(fmt, "rx empty polls", rx_empty_polls_ps, xsks[i]->app_stats.rx_empty_polls);
+               printf(fmt, "fill fail polls", fill_fail_polls_ps,
+                                                       xsks[i]->app_stats.fill_fail_polls);
+               printf(fmt, "copy tx sendtos", copy_tx_sendtos_ps,
+                                                       xsks[i]->app_stats.copy_tx_sendtos);
+               printf(fmt, "tx wakeup sendtos", tx_wakeup_sendtos_ps,
+                                                       xsks[i]->app_stats.tx_wakeup_sendtos);
+               printf(fmt, "opt polls", opt_polls_ps, xsks[i]->app_stats.opt_polls);
+
+               xsks[i]->app_stats.prev_rx_empty_polls = xsks[i]->app_stats.rx_empty_polls;
+               xsks[i]->app_stats.prev_fill_fail_polls = xsks[i]->app_stats.fill_fail_polls;
+               xsks[i]->app_stats.prev_copy_tx_sendtos = xsks[i]->app_stats.copy_tx_sendtos;
+               xsks[i]->app_stats.prev_tx_wakeup_sendtos = xsks[i]->app_stats.tx_wakeup_sendtos;
+               xsks[i]->app_stats.prev_opt_polls = xsks[i]->app_stats.opt_polls;
+       }
+}
+
+static bool get_interrupt_number(void)
+{
+       FILE *f_int_proc;
+       char line[4096];
+       bool found = false;
+
+       f_int_proc = fopen("/proc/interrupts", "r");
+       if (f_int_proc == NULL) {
+               printf("Failed to open /proc/interrupts.\n");
+               return found;
+       }
+
+       while (!feof(f_int_proc) && !found) {
+               /* Make sure to read a full line at a time */
+               if (fgets(line, sizeof(line), f_int_proc) == NULL ||
+                               line[strlen(line) - 1] != '\n') {
+                       printf("Error reading from interrupts file\n");
+                       break;
+               }
+
+               /* Extract interrupt number from line */
+               if (strstr(line, opt_irq_str) != NULL) {
+                       irq_no = atoi(line);
+                       found = true;
+                       break;
+               }
+       }
+
+       fclose(f_int_proc);
+
+       return found;
+}
+
+static int get_irqs(void)
+{
+       char count_path[PATH_MAX];
+       int total_intrs = -1;
+       FILE *f_count_proc;
+       char line[4096];
+
+       snprintf(count_path, sizeof(count_path),
+               "/sys/kernel/irq/%i/per_cpu_count", irq_no);
+       f_count_proc = fopen(count_path, "r");
+       if (f_count_proc == NULL) {
+               printf("Failed to open %s\n", count_path);
+               return total_intrs;
+       }
+
+       if (fgets(line, sizeof(line), f_count_proc) == NULL ||
+                       line[strlen(line) - 1] != '\n') {
+               printf("Error reading from %s\n", count_path);
+       } else {
+               static const char com[2] = ",";
+               char *token;
+
+               total_intrs = 0;
+               token = strtok(line, com);
+               while (token != NULL) {
+                       /* sum up interrupts across all cores */
+                       total_intrs += atoi(token);
+                       token = strtok(NULL, com);
+               }
+       }
+
+       fclose(f_count_proc);
+
+       return total_intrs;
+}
+
+static void dump_driver_stats(long dt)
+{
+       int i;
+
+       for (i = 0; i < num_socks && xsks[i]; i++) {
+               char *fmt = "%-18s %'-14.0f %'-14lu\n";
+               double intrs_ps;
+               int n_ints = get_irqs();
+
+               if (n_ints < 0) {
+                       printf("error getting intr info for intr %i\n", irq_no);
+                       return;
+               }
+               xsks[i]->drv_stats.intrs = n_ints - irqs_at_init;
+
+               intrs_ps = (xsks[i]->drv_stats.intrs - xsks[i]->drv_stats.prev_intrs) *
+                        1000000000. / dt;
+
+               printf("\n%-18s %-14s %-14s\n", "", "intrs/s", "count");
+               printf(fmt, "irqs", intrs_ps, xsks[i]->drv_stats.intrs);
+
+               xsks[i]->drv_stats.prev_intrs = xsks[i]->drv_stats.intrs;
+       }
+}
+
 static void dump_stats(void)
 {
        unsigned long now = get_nsecs();
@@ -194,67 +356,83 @@ static void dump_stats(void)
        prev_time = now;
 
        for (i = 0; i < num_socks && xsks[i]; i++) {
-               char *fmt = "%-15s %'-11.0f %'-11lu\n";
+               char *fmt = "%-18s %'-14.0f %'-14lu\n";
                double rx_pps, tx_pps, dropped_pps, rx_invalid_pps, full_pps, fill_empty_pps,
                        tx_invalid_pps, tx_empty_pps;
 
-               rx_pps = (xsks[i]->rx_npkts - xsks[i]->prev_rx_npkts) *
+               rx_pps = (xsks[i]->ring_stats.rx_npkts - xsks[i]->ring_stats.prev_rx_npkts) *
                         1000000000. / dt;
-               tx_pps = (xsks[i]->tx_npkts - xsks[i]->prev_tx_npkts) *
+               tx_pps = (xsks[i]->ring_stats.tx_npkts - xsks[i]->ring_stats.prev_tx_npkts) *
                         1000000000. / dt;
 
                printf("\n sock%d@", i);
                print_benchmark(false);
                printf("\n");
 
-               printf("%-15s %-11s %-11s %-11.2f\n", "", "pps", "pkts",
+               printf("%-18s %-14s %-14s %-14.2f\n", "", "pps", "pkts",
                       dt / 1000000000.);
-               printf(fmt, "rx", rx_pps, xsks[i]->rx_npkts);
-               printf(fmt, "tx", tx_pps, xsks[i]->tx_npkts);
+               printf(fmt, "rx", rx_pps, xsks[i]->ring_stats.rx_npkts);
+               printf(fmt, "tx", tx_pps, xsks[i]->ring_stats.tx_npkts);
 
-               xsks[i]->prev_rx_npkts = xsks[i]->rx_npkts;
-               xsks[i]->prev_tx_npkts = xsks[i]->tx_npkts;
+               xsks[i]->ring_stats.prev_rx_npkts = xsks[i]->ring_stats.rx_npkts;
+               xsks[i]->ring_stats.prev_tx_npkts = xsks[i]->ring_stats.tx_npkts;
 
                if (opt_extra_stats) {
                        if (!xsk_get_xdp_stats(xsk_socket__fd(xsks[i]->xsk), xsks[i])) {
-                               dropped_pps = (xsks[i]->rx_dropped_npkts -
-                                               xsks[i]->prev_rx_dropped_npkts) * 1000000000. / dt;
-                               rx_invalid_pps = (xsks[i]->rx_invalid_npkts -
-                                               xsks[i]->prev_rx_invalid_npkts) * 1000000000. / dt;
-                               tx_invalid_pps = (xsks[i]->tx_invalid_npkts -
-                                               xsks[i]->prev_tx_invalid_npkts) * 1000000000. / dt;
-                               full_pps = (xsks[i]->rx_full_npkts -
-                                               xsks[i]->prev_rx_full_npkts) * 1000000000. / dt;
-                               fill_empty_pps = (xsks[i]->rx_fill_empty_npkts -
-                                               xsks[i]->prev_rx_fill_empty_npkts)
-                                               * 1000000000. / dt;
-                               tx_empty_pps = (xsks[i]->tx_empty_npkts -
-                                               xsks[i]->prev_tx_empty_npkts) * 1000000000. / dt;
+                               dropped_pps = (xsks[i]->ring_stats.rx_dropped_npkts -
+                                               xsks[i]->ring_stats.prev_rx_dropped_npkts) *
+                                                       1000000000. / dt;
+                               rx_invalid_pps = (xsks[i]->ring_stats.rx_invalid_npkts -
+                                               xsks[i]->ring_stats.prev_rx_invalid_npkts) *
+                                                       1000000000. / dt;
+                               tx_invalid_pps = (xsks[i]->ring_stats.tx_invalid_npkts -
+                                               xsks[i]->ring_stats.prev_tx_invalid_npkts) *
+                                                       1000000000. / dt;
+                               full_pps = (xsks[i]->ring_stats.rx_full_npkts -
+                                               xsks[i]->ring_stats.prev_rx_full_npkts) *
+                                                       1000000000. / dt;
+                               fill_empty_pps = (xsks[i]->ring_stats.rx_fill_empty_npkts -
+                                               xsks[i]->ring_stats.prev_rx_fill_empty_npkts) *
+                                                       1000000000. / dt;
+                               tx_empty_pps = (xsks[i]->ring_stats.tx_empty_npkts -
+                                               xsks[i]->ring_stats.prev_tx_empty_npkts) *
+                                                       1000000000. / dt;
 
                                printf(fmt, "rx dropped", dropped_pps,
-                                      xsks[i]->rx_dropped_npkts);
+                                      xsks[i]->ring_stats.rx_dropped_npkts);
                                printf(fmt, "rx invalid", rx_invalid_pps,
-                                      xsks[i]->rx_invalid_npkts);
+                                      xsks[i]->ring_stats.rx_invalid_npkts);
                                printf(fmt, "tx invalid", tx_invalid_pps,
-                                      xsks[i]->tx_invalid_npkts);
+                                      xsks[i]->ring_stats.tx_invalid_npkts);
                                printf(fmt, "rx queue full", full_pps,
-                                      xsks[i]->rx_full_npkts);
+                                      xsks[i]->ring_stats.rx_full_npkts);
                                printf(fmt, "fill ring empty", fill_empty_pps,
-                                      xsks[i]->rx_fill_empty_npkts);
+                                      xsks[i]->ring_stats.rx_fill_empty_npkts);
                                printf(fmt, "tx ring empty", tx_empty_pps,
-                                      xsks[i]->tx_empty_npkts);
-
-                               xsks[i]->prev_rx_dropped_npkts = xsks[i]->rx_dropped_npkts;
-                               xsks[i]->prev_rx_invalid_npkts = xsks[i]->rx_invalid_npkts;
-                               xsks[i]->prev_tx_invalid_npkts = xsks[i]->tx_invalid_npkts;
-                               xsks[i]->prev_rx_full_npkts = xsks[i]->rx_full_npkts;
-                               xsks[i]->prev_rx_fill_empty_npkts = xsks[i]->rx_fill_empty_npkts;
-                               xsks[i]->prev_tx_empty_npkts = xsks[i]->tx_empty_npkts;
+                                      xsks[i]->ring_stats.tx_empty_npkts);
+
+                               xsks[i]->ring_stats.prev_rx_dropped_npkts =
+                                       xsks[i]->ring_stats.rx_dropped_npkts;
+                               xsks[i]->ring_stats.prev_rx_invalid_npkts =
+                                       xsks[i]->ring_stats.rx_invalid_npkts;
+                               xsks[i]->ring_stats.prev_tx_invalid_npkts =
+                                       xsks[i]->ring_stats.tx_invalid_npkts;
+                               xsks[i]->ring_stats.prev_rx_full_npkts =
+                                       xsks[i]->ring_stats.rx_full_npkts;
+                               xsks[i]->ring_stats.prev_rx_fill_empty_npkts =
+                                       xsks[i]->ring_stats.rx_fill_empty_npkts;
+                               xsks[i]->ring_stats.prev_tx_empty_npkts =
+                                       xsks[i]->ring_stats.tx_empty_npkts;
                        } else {
                                printf("%-15s\n", "Error retrieving extra stats");
                        }
                }
        }
+
+       if (opt_app_stats)
+               dump_app_stats(dt);
+       if (irq_no)
+               dump_driver_stats(dt);
 }
 
 static bool is_benchmark_done(void)
@@ -693,6 +871,17 @@ static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem,
        if (ret)
                exit_with_error(-ret);
 
+       xsk->app_stats.rx_empty_polls = 0;
+       xsk->app_stats.fill_fail_polls = 0;
+       xsk->app_stats.copy_tx_sendtos = 0;
+       xsk->app_stats.tx_wakeup_sendtos = 0;
+       xsk->app_stats.opt_polls = 0;
+       xsk->app_stats.prev_rx_empty_polls = 0;
+       xsk->app_stats.prev_fill_fail_polls = 0;
+       xsk->app_stats.prev_copy_tx_sendtos = 0;
+       xsk->app_stats.prev_tx_wakeup_sendtos = 0;
+       xsk->app_stats.prev_opt_polls = 0;
+
        return xsk;
 }
 
@@ -720,6 +909,8 @@ static struct option long_options[] = {
        {"tx-pkt-pattern", required_argument, 0, 'P'},
        {"extra-stats", no_argument, 0, 'x'},
        {"quiet", no_argument, 0, 'Q'},
+       {"app-stats", no_argument, 0, 'a'},
+       {"irq-string", required_argument, 0, 'I'},
        {0, 0, 0, 0}
 };
 
@@ -756,6 +947,8 @@ static void usage(const char *prog)
                "  -P, --tx-pkt-pattern=nPacket fill pattern. Default: 0x%x\n"
                "  -x, --extra-stats    Display extra statistics.\n"
                "  -Q, --quiet          Do not display any stats.\n"
+               "  -a, --app-stats      Display application (syscall) statistics.\n"
+               "  -I, --irq-string     Display driver interrupt statistics for interface associated with irq-string.\n"
                "\n";
        fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE,
                opt_batch_size, MIN_PKT_SIZE, MIN_PKT_SIZE,
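
As an aside, the two new switches compose with the existing ones: a hypothetical invocation such as ./xdpsock -i eth0 -q 1 -r -p -a -I eth0-rx-1 (interface, queue and IRQ string are placeholders for your setup) would print the sendto()/poll() counters and the per-IRQ interrupt rate next to the usual ring statistics.
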
@@ -771,7 +964,7 @@ static void parse_command_line(int argc, char **argv)
        opterr = 0;
 
        for (;;) {
-               c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:xQ",
+               c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:xQaI:",
                                long_options, &option_index);
                if (c == -1)
                        break;
@@ -857,6 +1050,19 @@ static void parse_command_line(int argc, char **argv)
                        break;
                case 'Q':
                        opt_quiet = 1;
+                       break;
+               case 'a':
+                       opt_app_stats = 1;
+                       break;
+               case 'I':
+                       opt_irq_str = optarg;
+                       if (get_interrupt_number())
+                               irqs_at_init = get_irqs();
+                       if (irqs_at_init < 0) {
+                               fprintf(stderr, "ERROR: Failed to get irqs for %s\n", opt_irq_str);
+                               usage(basename(argv[0]));
+                       }
+
                        break;
                default:
                        usage(basename(argv[0]));
@@ -908,8 +1114,10 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk,
         * is driven by the NAPI loop. So as an optimization, we do not have to call
         * sendto() all the time in zero-copy mode for l2fwd.
         */
-       if (opt_xdp_bind_flags & XDP_COPY)
+       if (opt_xdp_bind_flags & XDP_COPY) {
+               xsk->app_stats.copy_tx_sendtos++;
                kick_tx(xsk);
+       }
 
        ndescs = (xsk->outstanding_tx > opt_batch_size) ? opt_batch_size :
                xsk->outstanding_tx;
@@ -924,8 +1132,10 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk,
                while (ret != rcvd) {
                        if (ret < 0)
                                exit_with_error(-ret);
-                       if (xsk_ring_prod__needs_wakeup(&umem->fq))
+                       if (xsk_ring_prod__needs_wakeup(&umem->fq)) {
+                               xsk->app_stats.fill_fail_polls++;
                                ret = poll(fds, num_socks, opt_timeout);
+                       }
                        ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq);
                }
 
@@ -936,7 +1146,7 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk,
                xsk_ring_prod__submit(&xsk->umem->fq, rcvd);
                xsk_ring_cons__release(&xsk->umem->cq, rcvd);
                xsk->outstanding_tx -= rcvd;
-               xsk->tx_npkts += rcvd;
+               xsk->ring_stats.tx_npkts += rcvd;
        }
 }
 
@@ -949,14 +1159,16 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk,
        if (!xsk->outstanding_tx)
                return;
 
-       if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx))
+       if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx)) {
+               xsk->app_stats.tx_wakeup_sendtos++;
                kick_tx(xsk);
+       }
 
        rcvd = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &idx);
        if (rcvd > 0) {
                xsk_ring_cons__release(&xsk->umem->cq, rcvd);
                xsk->outstanding_tx -= rcvd;
-               xsk->tx_npkts += rcvd;
+               xsk->ring_stats.tx_npkts += rcvd;
        }
 }
 
@@ -968,8 +1180,10 @@ static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds)
 
        rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx);
        if (!rcvd) {
-               if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
+               if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) {
+                       xsk->app_stats.rx_empty_polls++;
                        ret = poll(fds, num_socks, opt_timeout);
+               }
                return;
        }
 
@@ -977,8 +1191,10 @@ static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds)
        while (ret != rcvd) {
                if (ret < 0)
                        exit_with_error(-ret);
-               if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
+               if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) {
+                       xsk->app_stats.fill_fail_polls++;
                        ret = poll(fds, num_socks, opt_timeout);
+               }
                ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
        }
 
@@ -996,7 +1212,7 @@ static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds)
 
        xsk_ring_prod__submit(&xsk->umem->fq, rcvd);
        xsk_ring_cons__release(&xsk->rx, rcvd);
-       xsk->rx_npkts += rcvd;
+       xsk->ring_stats.rx_npkts += rcvd;
 }
 
 static void rx_drop_all(void)
@@ -1011,6 +1227,8 @@ static void rx_drop_all(void)
 
        for (;;) {
                if (opt_poll) {
+                       for (i = 0; i < num_socks; i++)
+                               xsks[i]->app_stats.opt_polls++;
                        ret = poll(fds, num_socks, opt_timeout);
                        if (ret <= 0)
                                continue;
@@ -1091,6 +1309,8 @@ static void tx_only_all(void)
                int batch_size = get_batch_size(pkt_cnt);
 
                if (opt_poll) {
+                       for (i = 0; i < num_socks; i++)
+                               xsks[i]->app_stats.opt_polls++;
                        ret = poll(fds, num_socks, opt_timeout);
                        if (ret <= 0)
                                continue;
@@ -1122,8 +1342,10 @@ static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds)
 
        rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx);
        if (!rcvd) {
-               if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
+               if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) {
+                       xsk->app_stats.rx_empty_polls++;
                        ret = poll(fds, num_socks, opt_timeout);
+               }
                return;
        }
 
@@ -1132,8 +1354,10 @@ static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds)
                if (ret < 0)
                        exit_with_error(-ret);
                complete_tx_l2fwd(xsk, fds);
-               if (xsk_ring_prod__needs_wakeup(&xsk->tx))
+               if (xsk_ring_prod__needs_wakeup(&xsk->tx)) {
+                       xsk->app_stats.tx_wakeup_sendtos++;
                        kick_tx(xsk);
+               }
                ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
        }
 
@@ -1155,7 +1379,7 @@ static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds)
        xsk_ring_prod__submit(&xsk->tx, rcvd);
        xsk_ring_cons__release(&xsk->rx, rcvd);
 
-       xsk->rx_npkts += rcvd;
+       xsk->ring_stats.rx_npkts += rcvd;
        xsk->outstanding_tx += rcvd;
 }
 
@@ -1171,6 +1395,8 @@ static void l2fwd_all(void)
 
        for (;;) {
                if (opt_poll) {
+                       for (i = 0; i < num_socks; i++)
+                               xsks[i]->app_stats.opt_polls++;
                        ret = poll(fds, num_socks, opt_timeout);
                        if (ret <= 0)
                                continue;
index 4f556cf..bf5a99d 100644 (file)
@@ -356,18 +356,36 @@ enum bpf_link_type {
 #define BPF_F_SLEEPABLE                (1U << 4)
 
 /* When BPF ldimm64's insn[0].src_reg != 0 then this can have
- * two extensions:
- *
- * insn[0].src_reg:  BPF_PSEUDO_MAP_FD   BPF_PSEUDO_MAP_VALUE
- * insn[0].imm:      map fd              map fd
- * insn[1].imm:      0                   offset into value
- * insn[0].off:      0                   0
- * insn[1].off:      0                   0
- * ldimm64 rewrite:  address of map      address of map[0]+offset
- * verifier type:    CONST_PTR_TO_MAP    PTR_TO_MAP_VALUE
+ * the following extensions:
+ *
+ * insn[0].src_reg:  BPF_PSEUDO_MAP_FD
+ * insn[0].imm:      map fd
+ * insn[1].imm:      0
+ * insn[0].off:      0
+ * insn[1].off:      0
+ * ldimm64 rewrite:  address of map
+ * verifier type:    CONST_PTR_TO_MAP
  */
 #define BPF_PSEUDO_MAP_FD      1
+/* insn[0].src_reg:  BPF_PSEUDO_MAP_VALUE
+ * insn[0].imm:      map fd
+ * insn[1].imm:      offset into value
+ * insn[0].off:      0
+ * insn[1].off:      0
+ * ldimm64 rewrite:  address of map[0]+offset
+ * verifier type:    PTR_TO_MAP_VALUE
+ */
 #define BPF_PSEUDO_MAP_VALUE   2
+/* insn[0].src_reg:  BPF_PSEUDO_BTF_ID
+ * insn[0].imm:      kernel btf id of VAR
+ * insn[1].imm:      0
+ * insn[0].off:      0
+ * insn[1].off:      0
+ * ldimm64 rewrite:  address of the kernel variable
+ * verifier type:    PTR_TO_BTF_ID or PTR_TO_MEM, depending on whether the var
+ *                   is struct/union.
+ */
+#define BPF_PSEUDO_BTF_ID      3
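
For illustration only (not part of this patch), a loader could hand-roll the two-instruction ldimm64 that carries this new src_reg flavor as sketched below; emit_ksym_load, the destination register and var_btf_id are made-up placeholders, and in practice the BTF id is resolved by the loader, e.g. by libbpf for typed __ksym externs as the libbpf hunks further down do.

#include <linux/bpf.h>

/* Sketch: build the ldimm64 pair that the verifier rewrites into the
 * address of the kernel variable identified by var_btf_id.
 */
static void emit_ksym_load(struct bpf_insn insn[2], __u32 var_btf_id)
{
	insn[0] = (struct bpf_insn) {
		.code    = BPF_LD | BPF_DW | BPF_IMM,
		.dst_reg = BPF_REG_2,		/* arbitrary destination register */
		.src_reg = BPF_PSEUDO_BTF_ID,
		.imm     = var_btf_id,
	};
	insn[1] = (struct bpf_insn) {};		/* second half: imm and off stay 0 */
}
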
 
 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative
  * offset to another bpf function
@@ -417,6 +435,9 @@ enum {
 
 /* Share perf_event among processes */
        BPF_F_PRESERVE_ELEMS    = (1U << 11),
+
+/* Create a map that is suitable to be an inner map with dynamic max entries */
+       BPF_F_INNER_MAP         = (1U << 12),
 };
 
 /* Flags for BPF_PROG_QUERY. */
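
A hedged sketch of what the new flag enables with libbpf's BTF-defined maps (the map names below are illustrative, not from the patch): two inner arrays of different sizes can share one outer array map as long as both carry BPF_F_INNER_MAP, which is the same idea the btf_map_in_map selftest changes further down exercise.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct inner_small_map {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(map_flags, BPF_F_INNER_MAP);	/* opt out of inlined array lookups */
	__uint(max_entries, 3);
	__type(key, int);
	__type(value, int);
} inner_small SEC(".maps");

struct inner_big_map {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(map_flags, BPF_F_INNER_MAP);
	__uint(max_entries, 1024);		/* deliberately a different size */
	__type(key, int);
	__type(value, int);
} inner_big SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
	__uint(max_entries, 2);
	__uint(key_size, sizeof(int));
	__uint(value_size, sizeof(int));
	__array(values, struct inner_small_map);	/* inner map type template */
} outer_dyn SEC(".maps") = {
	.values = { [0] = &inner_small },
};

Without BPF_F_INNER_MAP, replacing a slot with inner_big's fd at run time via bpf_map_update_elem() would be rejected because its max_entries differs from the template; with the flag set on both inner maps, either one can be plugged in.
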
@@ -1680,7 +1701,7 @@ union bpf_attr {
  *               **TCP_CONGESTION**, **TCP_BPF_IW**,
  *               **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**,
  *               **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**,
- *               **TCP_SYNCNT**, **TCP_USER_TIMEOUT**.
+ *               **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**.
  *             * **IPPROTO_IP**, which supports *optname* **IP_TOS**.
  *             * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
  *     Return
@@ -2235,7 +2256,7 @@ union bpf_attr {
  *     Description
  *             This helper is used in programs implementing policies at the
  *             skb socket level. If the sk_buff *skb* is allowed to pass (i.e.
- *             if the verdeict eBPF program returns **SK_PASS**), redirect it
+ *             if the verdict eBPF program returns **SK_PASS**), redirect it
  *             to the socket referenced by *map* (of type
  *             **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
  *             egress interfaces can be used for redirection. The
@@ -3661,10 +3682,59 @@ union bpf_attr {
  *             Redirect the packet to another net device of index *ifindex*
  *             and fill in L2 addresses from neighboring subsystem. This helper
  *             is somewhat similar to **bpf_redirect**\ (), except that it
- *             fills in e.g. MAC addresses based on the L3 information from
- *             the packet. This helper is supported for IPv4 and IPv6 protocols.
+ *             populates L2 addresses as well: internally, the helper performs
+ *             a FIB lookup based on the skb's networking header to get the
+ *             address of the next hop, and then relies on the neighbor lookup
+ *             for the L2 address of that next hop.
+ *
+ *             The *flags* argument is reserved and must be 0. The helper is
+ *             currently only supported for tc BPF program types, and enabled
+ *             for IPv4 and IPv6 protocols.
+ *     Return
+ *             The helper returns **TC_ACT_REDIRECT** on success or
+ *             **TC_ACT_SHOT** on error.
+ *
+ * void *bpf_per_cpu_ptr(const void *percpu_ptr, u32 cpu)
+ *     Description
+ *             Take a pointer to a percpu ksym, *percpu_ptr*, and return a
+ *             pointer to the percpu kernel variable on *cpu*. A ksym is an
+ *             extern variable decorated with '__ksym'. For each ksym, a
+ *             global variable (either static or global) of the same name is
+ *             defined in the kernel. The ksym is percpu if that kernel
+ *             variable is percpu. The returned pointer points to the global
+ *             percpu variable on *cpu*.
+ *
+ *             bpf_per_cpu_ptr() has the same semantics as per_cpu_ptr() in the
+ *             kernel, except that bpf_per_cpu_ptr() may return NULL. This
+ *             happens if *cpu* is larger than nr_cpu_ids. The caller of
+ *             bpf_per_cpu_ptr() must check the returned value.
+ *     Return
+ *             A pointer pointing to the kernel percpu variable on *cpu*, or
+ *             NULL, if *cpu* is invalid.
+ *
+ * void *bpf_this_cpu_ptr(const void *percpu_ptr)
+ *     Description
+ *             Take a pointer to a percpu ksym, *percpu_ptr*, and return a
+ *             pointer to the percpu kernel variable on this cpu. See the
+ *             description of 'ksym' in **bpf_per_cpu_ptr**\ ().
+ *
+ *             bpf_this_cpu_ptr() has the same semantics as this_cpu_ptr() in
+ *             the kernel. Unlike **bpf_per_cpu_ptr**\ (), it never returns
+ *             NULL.
+ *     Return
+ *             A pointer pointing to the kernel percpu variable on this cpu.
+ *
+ * long bpf_redirect_peer(u32 ifindex, u64 flags)
+ *     Description
+ *             Redirect the packet to another net device of index *ifindex*.
+ *             This helper is somewhat similar to **bpf_redirect**\ (), except
+ *             that the redirection happens to the *ifindex*' peer device and
+ *             the netns switch takes place from ingress to ingress without
+ *             going through the CPU's backlog queue.
+ *
  *             The *flags* argument is reserved and must be 0. The helper is
- *             currently only supported for tc BPF program types.
+ *             currently only supported for tc BPF program types at the ingress
+ *             hook and for veth device types. The peer device must reside in a
+ *             different network namespace.
  *     Return
  *             The helper returns **TC_ACT_REDIRECT** on success or
  *             **TC_ACT_SHOT** on error.
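
To make the intended use concrete, here is a minimal tc-BPF sketch (not from this series): it assumes a veth pair whose peer sits in another netns, a placeholder PEER_IFINDEX known at compile time, and helper definitions generated from a bpf.h that already carries bpf_redirect_peer().

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define PEER_IFINDEX 4	/* placeholder: ifindex of the veth peer device */

SEC("classifier")
int tc_ingress_peer_redirect(struct __sk_buff *skb)
{
	/* Hop straight to the peer's ingress in the other netns,
	 * skipping the per-CPU backlog queue.
	 */
	return bpf_redirect_peer(PEER_IFINDEX, 0);
}

char _license[] SEC("license") = "GPL";
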
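And a sketch of the two per-CPU helpers from a raw tracepoint, relying on the typed-ksym support added to libbpf in the hunks below; bpf_prog_active is used purely as an example of an int percpu kernel variable, and vmlinux.h is assumed to be generated from the running kernel's BTF.

// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

extern const int bpf_prog_active __ksym;	/* percpu int in the kernel */

int cpu0_value = -1;
int this_cpu_value = -1;

SEC("raw_tp/sys_enter")
int read_percpu_var(const void *ctx)
{
	const int *p;

	/* instance of the variable on CPU 0; may be NULL for an invalid CPU */
	p = bpf_per_cpu_ptr(&bpf_prog_active, 0);
	if (p)
		cpu0_value = *p;

	/* instance on the CPU the program runs on; never NULL */
	p = bpf_this_cpu_ptr(&bpf_prog_active);
	this_cpu_value = *p;

	return 0;
}

char _license[] SEC("license") = "GPL";
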
@@ -3823,6 +3893,9 @@ union bpf_attr {
        FN(seq_printf_btf),             \
        FN(skb_cgroup_classid),         \
        FN(redirect_neigh),             \
+       FN(bpf_per_cpu_ptr),            \
+       FN(bpf_this_cpu_ptr),           \
+       FN(redirect_peer),              \
        /* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
index 3df1f4d..3130341 100644 (file)
@@ -390,6 +390,12 @@ struct extern_desc {
                } kcfg;
                struct {
                        unsigned long long addr;
+
+                       /* target btf_id of the corresponding kernel var. */
+                       int vmlinux_btf_id;
+
+                       /* local btf_id of the ksym extern's type. */
+                       __u32 type_id;
                } ksym;
        };
 };
@@ -2522,12 +2528,23 @@ static int bpf_object__load_vmlinux_btf(struct bpf_object *obj)
 {
        bool need_vmlinux_btf = false;
        struct bpf_program *prog;
-       int err;
+       int i, err;
 
        /* CO-RE relocations need kernel BTF */
        if (obj->btf_ext && obj->btf_ext->core_relo_info.len)
                need_vmlinux_btf = true;
 
+       /* Support for typed ksyms needs kernel BTF */
+       for (i = 0; i < obj->nr_extern; i++) {
+               const struct extern_desc *ext;
+
+               ext = &obj->externs[i];
+               if (ext->type == EXT_KSYM && ext->ksym.type_id) {
+                       need_vmlinux_btf = true;
+                       break;
+               }
+       }
+
        bpf_object__for_each_program(prog, obj) {
                if (!prog->load)
                        continue;
@@ -3156,16 +3173,10 @@ static int bpf_object__collect_externs(struct bpf_object *obj)
                                return -ENOTSUP;
                        }
                } else if (strcmp(sec_name, KSYMS_SEC) == 0) {
-                       const struct btf_type *vt;
-
                        ksym_sec = sec;
                        ext->type = EXT_KSYM;
-
-                       vt = skip_mods_and_typedefs(obj->btf, t->type, NULL);
-                       if (!btf_is_void(vt)) {
-                               pr_warn("extern (ksym) '%s' is not typeless (void)\n", ext_name);
-                               return -ENOTSUP;
-                       }
+                       skip_mods_and_typedefs(obj->btf, t->type,
+                                              &ext->ksym.type_id);
                } else {
                        pr_warn("unrecognized extern section '%s'\n", sec_name);
                        return -ENOTSUP;
@@ -4192,6 +4203,36 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map)
        return 0;
 }
 
+static int init_map_slots(struct bpf_map *map)
+{
+       const struct bpf_map *targ_map;
+       unsigned int i;
+       int fd, err;
+
+       for (i = 0; i < map->init_slots_sz; i++) {
+               if (!map->init_slots[i])
+                       continue;
+
+               targ_map = map->init_slots[i];
+               fd = bpf_map__fd(targ_map);
+               err = bpf_map_update_elem(map->fd, &i, &fd, 0);
+               if (err) {
+                       err = -errno;
+                       pr_warn("map '%s': failed to initialize slot [%d] to map '%s' fd=%d: %d\n",
+                               map->name, i, targ_map->name,
+                               fd, err);
+                       return err;
+               }
+               pr_debug("map '%s': slot [%d] set to map '%s' fd=%d\n",
+                        map->name, i, targ_map->name, fd);
+       }
+
+       zfree(&map->init_slots);
+       map->init_slots_sz = 0;
+
+       return 0;
+}
+
 static int
 bpf_object__create_maps(struct bpf_object *obj)
 {
@@ -4215,47 +4256,29 @@ bpf_object__create_maps(struct bpf_object *obj)
                if (map->fd >= 0) {
                        pr_debug("map '%s': skipping creation (preset fd=%d)\n",
                                 map->name, map->fd);
-                       continue;
-               }
-
-               err = bpf_object__create_map(obj, map);
-               if (err)
-                       goto err_out;
-
-               pr_debug("map '%s': created successfully, fd=%d\n", map->name,
-                        map->fd);
-
-               if (bpf_map__is_internal(map)) {
-                       err = bpf_object__populate_internal_map(obj, map);
-                       if (err < 0) {
-                               zclose(map->fd);
+               } else {
+                       err = bpf_object__create_map(obj, map);
+                       if (err)
                                goto err_out;
-                       }
-               }
 
-               if (map->init_slots_sz) {
-                       for (j = 0; j < map->init_slots_sz; j++) {
-                               const struct bpf_map *targ_map;
-                               int fd;
+                       pr_debug("map '%s': created successfully, fd=%d\n",
+                                map->name, map->fd);
 
-                               if (!map->init_slots[j])
-                                       continue;
+                       if (bpf_map__is_internal(map)) {
+                               err = bpf_object__populate_internal_map(obj, map);
+                               if (err < 0) {
+                                       zclose(map->fd);
+                                       goto err_out;
+                               }
+                       }
 
-                               targ_map = map->init_slots[j];
-                               fd = bpf_map__fd(targ_map);
-                               err = bpf_map_update_elem(map->fd, &j, &fd, 0);
-                               if (err) {
-                                       err = -errno;
-                                       pr_warn("map '%s': failed to initialize slot [%d] to map '%s' fd=%d: %d\n",
-                                               map->name, j, targ_map->name,
-                                               fd, err);
+                       if (map->init_slots_sz) {
+                               err = init_map_slots(map);
+                               if (err < 0) {
+                                       zclose(map->fd);
                                        goto err_out;
                                }
-                               pr_debug("map '%s': slot [%d] set to map '%s' fd=%d\n",
-                                        map->name, j, targ_map->name, fd);
                        }
-                       zfree(&map->init_slots);
-                       map->init_slots_sz = 0;
                }
 
                if (map->pin_path && !map->pinned) {
@@ -5017,16 +5040,19 @@ static int bpf_core_spec_match(struct bpf_core_spec *local_spec,
 static int bpf_core_calc_field_relo(const struct bpf_program *prog,
                                    const struct bpf_core_relo *relo,
                                    const struct bpf_core_spec *spec,
-                                   __u32 *val, bool *validate)
+                                   __u32 *val, __u32 *field_sz, __u32 *type_id,
+                                   bool *validate)
 {
        const struct bpf_core_accessor *acc;
        const struct btf_type *t;
-       __u32 byte_off, byte_sz, bit_off, bit_sz;
+       __u32 byte_off, byte_sz, bit_off, bit_sz, field_type_id;
        const struct btf_member *m;
        const struct btf_type *mt;
        bool bitfield;
        __s64 sz;
 
+       *field_sz = 0;
+
        if (relo->kind == BPF_FIELD_EXISTS) {
                *val = spec ? 1 : 0;
                return 0;
@@ -5042,6 +5068,12 @@ static int bpf_core_calc_field_relo(const struct bpf_program *prog,
        if (!acc->name) {
                if (relo->kind == BPF_FIELD_BYTE_OFFSET) {
                        *val = spec->bit_offset / 8;
+                       /* remember field size for load/store mem size */
+                       sz = btf__resolve_size(spec->btf, acc->type_id);
+                       if (sz < 0)
+                               return -EINVAL;
+                       *field_sz = sz;
+                       *type_id = acc->type_id;
                } else if (relo->kind == BPF_FIELD_BYTE_SIZE) {
                        sz = btf__resolve_size(spec->btf, acc->type_id);
                        if (sz < 0)
@@ -5058,7 +5090,7 @@ static int bpf_core_calc_field_relo(const struct bpf_program *prog,
        }
 
        m = btf_members(t) + acc->idx;
-       mt = skip_mods_and_typedefs(spec->btf, m->type, NULL);
+       mt = skip_mods_and_typedefs(spec->btf, m->type, &field_type_id);
        bit_off = spec->bit_offset;
        bit_sz = btf_member_bitfield_size(t, acc->idx);
 
@@ -5078,7 +5110,7 @@ static int bpf_core_calc_field_relo(const struct bpf_program *prog,
                        byte_off = bit_off / 8 / byte_sz * byte_sz;
                }
        } else {
-               sz = btf__resolve_size(spec->btf, m->type);
+               sz = btf__resolve_size(spec->btf, field_type_id);
                if (sz < 0)
                        return -EINVAL;
                byte_sz = sz;
@@ -5096,6 +5128,10 @@ static int bpf_core_calc_field_relo(const struct bpf_program *prog,
        switch (relo->kind) {
        case BPF_FIELD_BYTE_OFFSET:
                *val = byte_off;
+               if (!bitfield) {
+                       *field_sz = byte_sz;
+                       *type_id = field_type_id;
+               }
                break;
        case BPF_FIELD_BYTE_SIZE:
                *val = byte_sz;
@@ -5196,6 +5232,19 @@ struct bpf_core_relo_res
        bool poison;
        /* some relocations can't be validated against orig_val */
        bool validate;
+       /* for field byte offset relocations or the forms:
+        *     *(T *)(rX + <off>) = rY
+        *     rX = *(T *)(rY + <off>),
+        * we remember original and resolved field size to adjust direct
+        * memory loads of pointers and integers; this is necessary for 32-bit
+        * host kernel architectures, but also allows to automatically
+        * relocate fields that were resized from, e.g., u32 to u64, etc.
+        */
+       bool fail_memsz_adjust;
+       __u32 orig_sz;
+       __u32 orig_type_id;
+       __u32 new_sz;
+       __u32 new_type_id;
 };
 
 /* Calculate original and target relocation values, given local and target
@@ -5217,10 +5266,56 @@ static int bpf_core_calc_relo(const struct bpf_program *prog,
        res->new_val = 0;
        res->poison = false;
        res->validate = true;
+       res->fail_memsz_adjust = false;
+       res->orig_sz = res->new_sz = 0;
+       res->orig_type_id = res->new_type_id = 0;
 
        if (core_relo_is_field_based(relo->kind)) {
-               err = bpf_core_calc_field_relo(prog, relo, local_spec, &res->orig_val, &res->validate);
-               err = err ?: bpf_core_calc_field_relo(prog, relo, targ_spec, &res->new_val, NULL);
+               err = bpf_core_calc_field_relo(prog, relo, local_spec,
+                                              &res->orig_val, &res->orig_sz,
+                                              &res->orig_type_id, &res->validate);
+               err = err ?: bpf_core_calc_field_relo(prog, relo, targ_spec,
+                                                     &res->new_val, &res->new_sz,
+                                                     &res->new_type_id, NULL);
+               if (err)
+                       goto done;
+               /* Validate if it's safe to adjust load/store memory size.
+                * Adjustments are performed only if original and new memory
+                * sizes differ.
+                */
+               res->fail_memsz_adjust = false;
+               if (res->orig_sz != res->new_sz) {
+                       const struct btf_type *orig_t, *new_t;
+
+                       orig_t = btf__type_by_id(local_spec->btf, res->orig_type_id);
+                       new_t = btf__type_by_id(targ_spec->btf, res->new_type_id);
+
+                       /* There are two use cases in which it's safe to
+                        * adjust load/store's mem size:
+                        *   - reading a 32-bit kernel pointer, while on the
+                        *   BPF side pointers are always 64-bit; in this case
+                        *   it's safe to "downsize" the instruction, because
+                        *   the pointer is treated as an unsigned integer
+                        *   with zero-extended upper 32 bits;
+                        *   - reading unsigned integers, again because
+                        *   zero-extension preserves the value correctly.
+                        *
+                        * In all other cases it's incorrect to attempt the
+                        * load/store, because the read value would be wrong,
+                        * so we poison the relocated instruction.
+                        */
+                       if (btf_is_ptr(orig_t) && btf_is_ptr(new_t))
+                               goto done;
+                       if (btf_is_int(orig_t) && btf_is_int(new_t) &&
+                           btf_int_encoding(orig_t) != BTF_INT_SIGNED &&
+                           btf_int_encoding(new_t) != BTF_INT_SIGNED)
+                               goto done;
+
+                       /* mark as invalid mem size adjustment, but this will
+                        * only be checked for LDX/STX/ST insns
+                        */
+                       res->fail_memsz_adjust = true;
+               }
        } else if (core_relo_is_type_based(relo->kind)) {
                err = bpf_core_calc_type_relo(relo, local_spec, &res->orig_val);
                err = err ?: bpf_core_calc_type_relo(relo, targ_spec, &res->new_val);
@@ -5229,6 +5324,7 @@ static int bpf_core_calc_relo(const struct bpf_program *prog,
                err = err ?: bpf_core_calc_enumval_relo(relo, targ_spec, &res->new_val);
        }
 
+done:
        if (err == -EUCLEAN) {
                /* EUCLEAN is used to signal instruction poisoning request */
                res->poison = true;
@@ -5268,6 +5364,28 @@ static bool is_ldimm64(struct bpf_insn *insn)
        return insn->code == (BPF_LD | BPF_IMM | BPF_DW);
 }
 
+static int insn_bpf_size_to_bytes(struct bpf_insn *insn)
+{
+       switch (BPF_SIZE(insn->code)) {
+       case BPF_DW: return 8;
+       case BPF_W: return 4;
+       case BPF_H: return 2;
+       case BPF_B: return 1;
+       default: return -1;
+       }
+}
+
+static int insn_bytes_to_bpf_size(__u32 sz)
+{
+       switch (sz) {
+       case 8: return BPF_DW;
+       case 4: return BPF_W;
+       case 2: return BPF_H;
+       case 1: return BPF_B;
+       default: return -1;
+       }
+}
+
 /*
  * Patch relocatable BPF instruction.
  *
@@ -5277,10 +5395,13 @@ static bool is_ldimm64(struct bpf_insn *insn)
  * spec, and is checked before patching instruction. If actual insn->imm value
  * is wrong, bail out with error.
  *
- * Currently three kinds of BPF instructions are supported:
+ * Currently supported classes of BPF instruction are:
  * 1. rX = <imm> (assignment with immediate operand);
  * 2. rX += <imm> (arithmetic operations with immediate operand);
- * 3. rX = <imm64> (load with 64-bit immediate value).
+ * 3. rX = <imm64> (load with 64-bit immediate value);
+ * 4. rX = *(T *)(rY + <off>), where T is one of {u8, u16, u32, u64};
+ * 5. *(T *)(rX + <off>) = rY, where T is one of {u8, u16, u32, u64};
+ * 6. *(T *)(rX + <off>) = <imm>, where T is one of {u8, u16, u32, u64}.
  */
 static int bpf_core_patch_insn(struct bpf_program *prog,
                               const struct bpf_core_relo *relo,
@@ -5304,6 +5425,7 @@ static int bpf_core_patch_insn(struct bpf_program *prog,
        class = BPF_CLASS(insn->code);
 
        if (res->poison) {
+poison:
                /* poison second part of ldimm64 to avoid confusing error from
                 * verifier about "unknown opcode 00"
                 */
@@ -5346,10 +5468,39 @@ static int bpf_core_patch_insn(struct bpf_program *prog,
                                prog->name, relo_idx, insn_idx, new_val);
                        return -ERANGE;
                }
+               if (res->fail_memsz_adjust) {
+                       pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) accesses field incorrectly. "
+                               "Make sure you are accessing pointers, unsigned integers, or fields of matching type and size.\n",
+                               prog->name, relo_idx, insn_idx);
+                       goto poison;
+               }
+
                orig_val = insn->off;
                insn->off = new_val;
                pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) off %u -> %u\n",
                         prog->name, relo_idx, insn_idx, orig_val, new_val);
+
+               if (res->new_sz != res->orig_sz) {
+                       int insn_bytes_sz, insn_bpf_sz;
+
+                       insn_bytes_sz = insn_bpf_size_to_bytes(insn);
+                       if (insn_bytes_sz != res->orig_sz) {
+                               pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) unexpected mem size: got %d, exp %u\n",
+                                       prog->name, relo_idx, insn_idx, insn_bytes_sz, res->orig_sz);
+                               return -EINVAL;
+                       }
+
+                       insn_bpf_sz = insn_bytes_to_bpf_size(res->new_sz);
+                       if (insn_bpf_sz < 0) {
+                               pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) invalid new mem size: %u\n",
+                                       prog->name, relo_idx, insn_idx, res->new_sz);
+                               return -EINVAL;
+                       }
+
+                       insn->code = BPF_MODE(insn->code) | insn_bpf_sz | BPF_CLASS(insn->code);
+                       pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) mem_sz %u -> %u\n",
+                                prog->name, relo_idx, insn_idx, res->orig_sz, res->new_sz);
+               }
                break;
        case BPF_LD: {
                __u64 imm;
@@ -5691,7 +5842,7 @@ bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path)
                return 0;
 
        if (targ_btf_path)
-               targ_btf = btf__parse_elf(targ_btf_path, NULL);
+               targ_btf = btf__parse(targ_btf_path, NULL);
        else
                targ_btf = obj->btf_vmlinux;
        if (IS_ERR_OR_NULL(targ_btf)) {
@@ -5742,6 +5893,11 @@ bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path)
                                err = -EINVAL;
                                goto out;
                        }
+                       /* no need to apply CO-RE relocation if the program is
+                        * not going to be loaded
+                        */
+                       if (!prog->load)
+                               continue;
 
                        err = bpf_core_apply_relo(prog, rec, i, obj->btf,
                                                  targ_btf, cand_cache);
@@ -5800,8 +5956,13 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog)
                                insn[0].imm = obj->maps[obj->kconfig_map_idx].fd;
                                insn[1].imm = ext->kcfg.data_off;
                        } else /* EXT_KSYM */ {
-                               insn[0].imm = (__u32)ext->ksym.addr;
-                               insn[1].imm = ext->ksym.addr >> 32;
+                               if (ext->ksym.type_id) { /* typed ksyms */
+                                       insn[0].src_reg = BPF_PSEUDO_BTF_ID;
+                                       insn[0].imm = ext->ksym.vmlinux_btf_id;
+                               } else { /* typeless ksyms */
+                                       insn[0].imm = (__u32)ext->ksym.addr;
+                                       insn[1].imm = ext->ksym.addr >> 32;
+                               }
                        }
                        relo->processed = true;
                        break;
@@ -6933,10 +7094,72 @@ out:
        return err;
 }
 
+static int bpf_object__resolve_ksyms_btf_id(struct bpf_object *obj)
+{
+       struct extern_desc *ext;
+       int i, id;
+
+       for (i = 0; i < obj->nr_extern; i++) {
+               const struct btf_type *targ_var, *targ_type;
+               __u32 targ_type_id, local_type_id;
+               const char *targ_var_name;
+               int ret;
+
+               ext = &obj->externs[i];
+               if (ext->type != EXT_KSYM || !ext->ksym.type_id)
+                       continue;
+
+               id = btf__find_by_name_kind(obj->btf_vmlinux, ext->name,
+                                           BTF_KIND_VAR);
+               if (id <= 0) {
+                       pr_warn("extern (ksym) '%s': failed to find BTF ID in vmlinux BTF.\n",
+                               ext->name);
+                       return -ESRCH;
+               }
+
+               /* find local type_id */
+               local_type_id = ext->ksym.type_id;
+
+               /* find target type_id */
+               targ_var = btf__type_by_id(obj->btf_vmlinux, id);
+               targ_var_name = btf__name_by_offset(obj->btf_vmlinux,
+                                                   targ_var->name_off);
+               targ_type = skip_mods_and_typedefs(obj->btf_vmlinux,
+                                                  targ_var->type,
+                                                  &targ_type_id);
+
+               ret = bpf_core_types_are_compat(obj->btf, local_type_id,
+                                               obj->btf_vmlinux, targ_type_id);
+               if (ret <= 0) {
+                       const struct btf_type *local_type;
+                       const char *targ_name, *local_name;
+
+                       local_type = btf__type_by_id(obj->btf, local_type_id);
+                       local_name = btf__name_by_offset(obj->btf,
+                                                        local_type->name_off);
+                       targ_name = btf__name_by_offset(obj->btf_vmlinux,
+                                                       targ_type->name_off);
+
+                       pr_warn("extern (ksym) '%s': incompatible types, expected [%d] %s %s, but kernel has [%d] %s %s\n",
+                               ext->name, local_type_id,
+                               btf_kind_str(local_type), local_name, targ_type_id,
+                               btf_kind_str(targ_type), targ_name);
+                       return -EINVAL;
+               }
+
+               ext->is_set = true;
+               ext->ksym.vmlinux_btf_id = id;
+               pr_debug("extern (ksym) '%s': resolved to [%d] %s %s\n",
+                        ext->name, id, btf_kind_str(targ_var), targ_var_name);
+       }
+       return 0;
+}
+
 static int bpf_object__resolve_externs(struct bpf_object *obj,
                                       const char *extra_kconfig)
 {
        bool need_config = false, need_kallsyms = false;
+       bool need_vmlinux_btf = false;
        struct extern_desc *ext;
        void *kcfg_data = NULL;
        int err, i;
@@ -6967,7 +7190,10 @@ static int bpf_object__resolve_externs(struct bpf_object *obj,
                           strncmp(ext->name, "CONFIG_", 7) == 0) {
                        need_config = true;
                } else if (ext->type == EXT_KSYM) {
-                       need_kallsyms = true;
+                       if (ext->ksym.type_id)
+                               need_vmlinux_btf = true;
+                       else
+                               need_kallsyms = true;
                } else {
                        pr_warn("unrecognized extern '%s'\n", ext->name);
                        return -EINVAL;
@@ -6996,6 +7222,11 @@ static int bpf_object__resolve_externs(struct bpf_object *obj,
                if (err)
                        return -EINVAL;
        }
+       if (need_vmlinux_btf) {
+               err = bpf_object__resolve_ksyms_btf_id(obj);
+               if (err)
+                       return -EINVAL;
+       }
        for (i = 0; i < obj->nr_extern; i++) {
                ext = &obj->externs[i];
 
@@ -7028,10 +7259,10 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr)
        }
 
        err = bpf_object__probe_loading(obj);
+       err = err ? : bpf_object__load_vmlinux_btf(obj);
        err = err ? : bpf_object__resolve_externs(obj, obj->kconfig);
        err = err ? : bpf_object__sanitize_and_load_btf(obj);
        err = err ? : bpf_object__sanitize_maps(obj);
-       err = err ? : bpf_object__load_vmlinux_btf(obj);
        err = err ? : bpf_object__init_kern_struct_ops_maps(obj);
        err = err ? : bpf_object__create_maps(obj);
        err = err ? : bpf_object__relocate(obj, attr->target_btf_path);
@@ -10353,9 +10584,8 @@ int bpf_program__set_attach_target(struct bpf_program *prog,
                btf_id = libbpf_find_prog_btf_id(attach_func_name,
                                                 attach_prog_fd);
        else
-               btf_id = __find_vmlinux_btf_id(prog->obj->btf_vmlinux,
-                                              attach_func_name,
-                                              prog->expected_attach_type);
+               btf_id = libbpf_find_vmlinux_btf_id(attach_func_name,
+                                                   prog->expected_attach_type);
 
        if (btf_id < 0)
                return btf_id;
index 30b4ca5..e3c98c0 100644 (file)
@@ -705,7 +705,7 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
        struct xsk_ctx *ctx;
        int err, ifindex;
 
-       if (!umem || !xsk_ptr || !(rx || tx) || !fill || !comp)
+       if (!umem || !xsk_ptr || !(rx || tx))
                return -EFAULT;
 
        xsk = calloc(1, sizeof(*xsk));
@@ -735,6 +735,11 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
 
        ctx = xsk_get_ctx(umem, ifindex, queue_id);
        if (!ctx) {
+               if (!fill || !comp) {
+                       err = -EFAULT;
+                       goto out_socket;
+               }
+
                ctx = xsk_create_ctx(xsk, umem, ifindex, ifname, queue_id,
                                     fill, comp);
                if (!ctx) {
index 66acfcf..ac9eda8 100644 (file)
@@ -7,6 +7,44 @@ General instructions on running selftests can be found in
 Additional information about selftest failures are
 documented here.
 
+profiler[23] test failures with clang/llvm <12.0.0
+==================================================
+
+With clang/llvm <12.0.0, the profiler[23] test may fail.
+The symptom looks like:
+
+.. code-block:: c
+
+  // r9 is a pointer to map_value
+  // r7 is a scalar
+  17:       bf 96 00 00 00 00 00 00 r6 = r9
+  18:       0f 76 00 00 00 00 00 00 r6 += r7
+  math between map_value pointer and register with unbounded min value is not allowed
+
+  // the instructions below will not be seen in the verifier log
+  19:       a5 07 01 00 01 01 00 00 if r7 < 257 goto +1
+  20:       bf 96 00 00 00 00 00 00 r6 = r9
+  // r6 is used here
+
+The verifier rejects such code with the above error.
+At insn 18, r7 is indeed unbounded. The later insn 19 checks the bounds and
+insn 20 undoes the map_value addition. It is currently impossible for the
+verifier to understand such speculative pointer arithmetic. Hence,
+    https://reviews.llvm.org/D85570
+addresses it on the compiler side; the fix landed in llvm 12.
+
+The corresponding C code:
+
+.. code-block:: c
+
+  for (int i = 0; i < MAX_CGROUPS_PATH_DEPTH; i++) {
+          filepart_length = bpf_probe_read_str(payload, ...);
+          if (filepart_length <= MAX_PATH) {
+                  barrier_var(filepart_length); // workaround
+                  payload += filepart_length;
+          }
+  }
+
 bpf_iter test failures with clang/llvm 10.0.0
 =============================================
 
index c548ade..5241405 100644 (file)
@@ -195,13 +195,13 @@ static struct bpf_align_test tests[] = {
                .prog_type = BPF_PROG_TYPE_SCHED_CLS,
                .matches = {
                        {7, "R3_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
-                       {8, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
+                       {8, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"},
                        {9, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
-                       {10, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
+                       {10, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"},
                        {11, "R4_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"},
-                       {12, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
+                       {12, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"},
                        {13, "R4_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
-                       {14, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
+                       {14, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"},
                        {15, "R4_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"},
                        {16, "R4_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"},
                },
@@ -518,7 +518,7 @@ static struct bpf_align_test tests[] = {
                         * the total offset is 4-byte aligned and meets the
                         * load's requirements.
                         */
-                       {20, "R5=pkt(id=1,off=0,r=4,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc)"},
+                       {20, "R5=pkt(id=2,off=0,r=4,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc)"},
 
                },
        },
@@ -561,18 +561,18 @@ static struct bpf_align_test tests[] = {
                        /* Adding 14 makes R6 be (4n+2) */
                        {11, "R6_w=inv(id=0,umin_value=14,umax_value=74,var_off=(0x2; 0x7c))"},
                        /* Subtracting from packet pointer overflows ubounds */
-                       {13, "R5_w=pkt(id=1,off=0,r=8,umin_value=18446744073709551542,umax_value=18446744073709551602,var_off=(0xffffffffffffff82; 0x7c)"},
+                       {13, "R5_w=pkt(id=2,off=0,r=8,umin_value=18446744073709551542,umax_value=18446744073709551602,var_off=(0xffffffffffffff82; 0x7c)"},
                        /* New unknown value in R7 is (4n), >= 76 */
                        {15, "R7_w=inv(id=0,umin_value=76,umax_value=1096,var_off=(0x0; 0x7fc))"},
                        /* Adding it to packet pointer gives nice bounds again */
-                       {16, "R5_w=pkt(id=2,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"},
+                       {16, "R5_w=pkt(id=3,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"},
                        /* At the time the word size load is performed from R5,
                         * its total fixed offset is NET_IP_ALIGN + reg->off (0)
                         * which is 2.  Then the variable offset is (4n+2), so
                         * the total offset is 4-byte aligned and meets the
                         * load's requirements.
                         */
-                       {20, "R5=pkt(id=2,off=0,r=4,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"},
+                       {20, "R5=pkt(id=3,off=0,r=4,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"},
                },
        },
 };
index 540fea4..76ebe4c 100644 (file)
@@ -55,10 +55,10 @@ static int kern_sync_rcu(void)
 
 static void test_lookup_update(void)
 {
-       int err, key = 0, val, i;
+       int map1_fd, map2_fd, map3_fd, map4_fd, map5_fd, map1_id, map2_id;
+       int outer_arr_fd, outer_hash_fd, outer_arr_dyn_fd;
        struct test_btf_map_in_map *skel;
-       int outer_arr_fd, outer_hash_fd;
-       int fd, map1_fd, map2_fd, map1_id, map2_id;
+       int err, key = 0, val, i, fd;
 
        skel = test_btf_map_in_map__open_and_load();
        if (CHECK(!skel, "skel_open", "failed to open&load skeleton\n"))
@@ -70,32 +70,45 @@ static void test_lookup_update(void)
 
        map1_fd = bpf_map__fd(skel->maps.inner_map1);
        map2_fd = bpf_map__fd(skel->maps.inner_map2);
+       map3_fd = bpf_map__fd(skel->maps.inner_map3);
+       map4_fd = bpf_map__fd(skel->maps.inner_map4);
+       map5_fd = bpf_map__fd(skel->maps.inner_map5);
+       outer_arr_dyn_fd = bpf_map__fd(skel->maps.outer_arr_dyn);
        outer_arr_fd = bpf_map__fd(skel->maps.outer_arr);
        outer_hash_fd = bpf_map__fd(skel->maps.outer_hash);
 
-       /* inner1 = input, inner2 = input + 1 */
-       map1_fd = bpf_map__fd(skel->maps.inner_map1);
+       /* inner1 = input, inner2 = input + 1, inner3 = input + 2 */
        bpf_map_update_elem(outer_arr_fd, &key, &map1_fd, 0);
-       map2_fd = bpf_map__fd(skel->maps.inner_map2);
        bpf_map_update_elem(outer_hash_fd, &key, &map2_fd, 0);
+       bpf_map_update_elem(outer_arr_dyn_fd, &key, &map3_fd, 0);
        skel->bss->input = 1;
        usleep(1);
-
        bpf_map_lookup_elem(map1_fd, &key, &val);
        CHECK(val != 1, "inner1", "got %d != exp %d\n", val, 1);
        bpf_map_lookup_elem(map2_fd, &key, &val);
        CHECK(val != 2, "inner2", "got %d != exp %d\n", val, 2);
+       bpf_map_lookup_elem(map3_fd, &key, &val);
+       CHECK(val != 3, "inner3", "got %d != exp %d\n", val, 3);
 
-       /* inner1 = input + 1, inner2 = input */
+       /* inner2 = input, inner1 = input + 1, inner4 = input + 2 */
        bpf_map_update_elem(outer_arr_fd, &key, &map2_fd, 0);
        bpf_map_update_elem(outer_hash_fd, &key, &map1_fd, 0);
+       bpf_map_update_elem(outer_arr_dyn_fd, &key, &map4_fd, 0);
        skel->bss->input = 3;
        usleep(1);
-
        bpf_map_lookup_elem(map1_fd, &key, &val);
        CHECK(val != 4, "inner1", "got %d != exp %d\n", val, 4);
        bpf_map_lookup_elem(map2_fd, &key, &val);
        CHECK(val != 3, "inner2", "got %d != exp %d\n", val, 3);
+       bpf_map_lookup_elem(map4_fd, &key, &val);
+       CHECK(val != 5, "inner4", "got %d != exp %d\n", val, 5);
+
+       /* inner5 = input + 2 */
+       bpf_map_update_elem(outer_arr_dyn_fd, &key, &map5_fd, 0);
+       skel->bss->input = 5;
+       usleep(1);
+       bpf_map_lookup_elem(map5_fd, &key, &val);
+       CHECK(val != 7, "inner5", "got %d != exp %d\n", val, 7);
 
        for (i = 0; i < 5; i++) {
                val = i % 2 ? map1_fd : map2_fd;
@@ -106,7 +119,13 @@ static void test_lookup_update(void)
                }
                err = bpf_map_update_elem(outer_arr_fd, &key, &val, 0);
                if (CHECK_FAIL(err)) {
-                       printf("failed to update hash_of_maps on iter #%d\n", i);
+                       printf("failed to update array_of_maps on iter #%d\n", i);
+                       goto cleanup;
+               }
+               val = i % 2 ? map4_fd : map5_fd;
+               err = bpf_map_update_elem(outer_arr_dyn_fd, &key, &val, 0);
+               if (CHECK_FAIL(err)) {
+                       printf("failed to update array_of_maps (dyn) on iter #%d\n", i);
                        goto cleanup;
                }
        }
diff --git a/tools/testing/selftests/bpf/prog_tests/core_autosize.c b/tools/testing/selftests/bpf/prog_tests/core_autosize.c
new file mode 100644 (file)
index 0000000..981c251
--- /dev/null
@@ -0,0 +1,225 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+
+#include <test_progs.h>
+#include <bpf/btf.h>
+
+/* Real layout and sizes according to the test's (32-bit) BTF;
+ * this needs to be defined before the skeleton is included.
+ */
+struct test_struct___real {
+       unsigned int ptr; /* can't use `void *`, it is always 8 bytes in the BPF target */
+       unsigned int val2;
+       unsigned long long val1;
+       unsigned short val3;
+       unsigned char val4;
+       unsigned char _pad;
+};
+
+#include "test_core_autosize.skel.h"
+
+static int duration = 0;
+
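+/* mirrors the .bss layout of the test_core_autosize BPF object; the test
+ * reads the whole .bss map into this struct and verifies each field below
+ */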
+static struct {
+       unsigned long long ptr_samesized;
+       unsigned long long val1_samesized;
+       unsigned long long val2_samesized;
+       unsigned long long val3_samesized;
+       unsigned long long val4_samesized;
+       struct test_struct___real output_samesized;
+
+       unsigned long long ptr_downsized;
+       unsigned long long val1_downsized;
+       unsigned long long val2_downsized;
+       unsigned long long val3_downsized;
+       unsigned long long val4_downsized;
+       struct test_struct___real output_downsized;
+
+       unsigned long long ptr_probed;
+       unsigned long long val1_probed;
+       unsigned long long val2_probed;
+       unsigned long long val3_probed;
+       unsigned long long val4_probed;
+
+       unsigned long long ptr_signed;
+       unsigned long long val1_signed;
+       unsigned long long val2_signed;
+       unsigned long long val3_signed;
+       unsigned long long val4_signed;
+       struct test_struct___real output_signed;
+} out;
+
+void test_core_autosize(void)
+{
+       char btf_file[] = "/tmp/core_autosize.btf.XXXXXX";
+       int err, fd = -1, zero = 0;
+       int char_id, short_id, int_id, long_long_id, void_ptr_id, id;
+       struct test_core_autosize* skel = NULL;
+       struct bpf_object_load_attr load_attr = {};
+       struct bpf_program *prog;
+       struct bpf_map *bss_map;
+       struct btf *btf = NULL;
+       size_t written;
+       const void *raw_data;
+       __u32 raw_sz;
+       FILE *f = NULL;
+
+       btf = btf__new_empty();
+       if (!ASSERT_OK_PTR(btf, "empty_btf"))
+               return;
+       /* Emit the following struct with 32-bit pointer size:
+        *
+        * struct test_struct {
+        *     void *ptr;
+        *     unsigned long val2;
+        *     unsigned long long val1;
+        *     unsigned short val3;
+        *     unsigned char val4;
+        *     char: 8;
+        * };
+        *
+        * This struct is going to be used as the "kernel BTF" for this test.
+        * It's equivalent memory-layout-wise to test_struct___real above.
+        */
+
+       /* force 32-bit pointer size */
+       btf__set_pointer_size(btf, 4);
+
+       char_id = btf__add_int(btf, "unsigned char", 1, 0);
+       ASSERT_EQ(char_id, 1, "char_id");
+       short_id = btf__add_int(btf, "unsigned short", 2, 0);
+       ASSERT_EQ(short_id, 2, "short_id");
+       /* "long unsigned int" of 4-byte size tells BTF that sizeof(void *) == 4 */
+       int_id = btf__add_int(btf, "long unsigned int", 4, 0);
+       ASSERT_EQ(int_id, 3, "int_id");
+       long_long_id = btf__add_int(btf, "unsigned long long", 8, 0);
+       ASSERT_EQ(long_long_id, 4, "long_long_id");
+       void_ptr_id = btf__add_ptr(btf, 0);
+       ASSERT_EQ(void_ptr_id, 5, "void_ptr_id");
+
+       id = btf__add_struct(btf, "test_struct", 20 /* bytes */);
+       ASSERT_EQ(id, 6, "struct_id");
+       err = btf__add_field(btf, "ptr", void_ptr_id, 0, 0);
+       err = err ?: btf__add_field(btf, "val2", int_id, 32, 0);
+       err = err ?: btf__add_field(btf, "val1", long_long_id, 64, 0);
+       err = err ?: btf__add_field(btf, "val3", short_id, 128, 0);
+       err = err ?: btf__add_field(btf, "val4", char_id, 144, 0);
+       ASSERT_OK(err, "struct_fields");
+
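+       /* dump the hand-crafted BTF into a temp file so it can be passed to
+        * the loader as the target (kernel) BTF via target_btf_path below
+        */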
+       fd = mkstemp(btf_file);
+       if (CHECK(fd < 0, "btf_tmp", "failed to create file: %d\n", fd))
+               goto cleanup;
+       f = fdopen(fd, "w");
+       if (!ASSERT_OK_PTR(f, "btf_fdopen"))
+               goto cleanup;
+
+       raw_data = btf__get_raw_data(btf, &raw_sz);
+       if (!ASSERT_OK_PTR(raw_data, "raw_data"))
+               goto cleanup;
+       written = fwrite(raw_data, 1, raw_sz, f);
+       if (CHECK(written != raw_sz, "btf_write", "written: %zu, errno: %d\n", written, errno))
+               goto cleanup;
+       fflush(f);
+       fclose(f);
+       f = NULL;
+       close(fd);
+       fd = -1;
+
+       /* open and load BPF program with custom BTF as the kernel BTF */
+       skel = test_core_autosize__open();
+       if (!ASSERT_OK_PTR(skel, "skel_open"))
+               return;
+
+       /* disable handle_signed() for now */
+       prog = bpf_object__find_program_by_name(skel->obj, "handle_signed");
+       if (!ASSERT_OK_PTR(prog, "prog_find"))
+               goto cleanup;
+       bpf_program__set_autoload(prog, false);
+
+       load_attr.obj = skel->obj;
+       load_attr.target_btf_path = btf_file;
+       err = bpf_object__load_xattr(&load_attr);
+       if (!ASSERT_OK(err, "prog_load"))
+               goto cleanup;
+
+       prog = bpf_object__find_program_by_name(skel->obj, "handle_samesize");
+       if (!ASSERT_OK_PTR(prog, "prog_find"))
+               goto cleanup;
+       skel->links.handle_samesize = bpf_program__attach(prog);
+       if (!ASSERT_OK_PTR(skel->links.handle_samesize, "prog_attach"))
+               goto cleanup;
+
+       prog = bpf_object__find_program_by_name(skel->obj, "handle_downsize");
+       if (!ASSERT_OK_PTR(prog, "prog_find"))
+               goto cleanup;
+       skel->links.handle_downsize = bpf_program__attach(prog);
+       if (!ASSERT_OK_PTR(skel->links.handle_downsize, "prog_attach"))
+               goto cleanup;
+
+       prog = bpf_object__find_program_by_name(skel->obj, "handle_probed");
+       if (!ASSERT_OK_PTR(prog, "prog_find"))
+               goto cleanup;
+       skel->links.handle_probed = bpf_program__attach(prog);
+       if (!ASSERT_OK_PTR(skel->links.handle_probed, "prog_attach"))
+               goto cleanup;
+
+       usleep(1);
+
+       bss_map = bpf_object__find_map_by_name(skel->obj, "test_cor.bss");
+       if (!ASSERT_OK_PTR(bss_map, "bss_map_find"))
+               goto cleanup;
+
+       err = bpf_map_lookup_elem(bpf_map__fd(bss_map), &zero, (void *)&out);
+       if (!ASSERT_OK(err, "bss_lookup"))
+               goto cleanup;
+
+       ASSERT_EQ(out.ptr_samesized, 0x01020304, "ptr_samesized");
+       ASSERT_EQ(out.val1_samesized, 0x1020304050607080, "val1_samesized");
+       ASSERT_EQ(out.val2_samesized, 0x0a0b0c0d, "val2_samesized");
+       ASSERT_EQ(out.val3_samesized, 0xfeed, "val3_samesized");
+       ASSERT_EQ(out.val4_samesized, 0xb9, "val4_samesized");
+       ASSERT_EQ(out.output_samesized.ptr, 0x01020304, "ptr_samesized");
+       ASSERT_EQ(out.output_samesized.val1, 0x1020304050607080, "val1_samesized");
+       ASSERT_EQ(out.output_samesized.val2, 0x0a0b0c0d, "val2_samesized");
+       ASSERT_EQ(out.output_samesized.val3, 0xfeed, "val3_samesized");
+       ASSERT_EQ(out.output_samesized.val4, 0xb9, "val4_samesized");
+
+       ASSERT_EQ(out.ptr_downsized, 0x01020304, "ptr_downsized");
+       ASSERT_EQ(out.val1_downsized, 0x1020304050607080, "val1_downsized");
+       ASSERT_EQ(out.val2_downsized, 0x0a0b0c0d, "val2_downsized");
+       ASSERT_EQ(out.val3_downsized, 0xfeed, "val3_downsized");
+       ASSERT_EQ(out.val4_downsized, 0xb9, "val4_downsized");
+       ASSERT_EQ(out.output_downsized.ptr, 0x01020304, "ptr_downsized");
+       ASSERT_EQ(out.output_downsized.val1, 0x1020304050607080, "val1_downsized");
+       ASSERT_EQ(out.output_downsized.val2, 0x0a0b0c0d, "val2_downsized");
+       ASSERT_EQ(out.output_downsized.val3, 0xfeed, "val3_downsized");
+       ASSERT_EQ(out.output_downsized.val4, 0xb9, "val4_downsized");
+
+       ASSERT_EQ(out.ptr_probed, 0x01020304, "ptr_probed");
+       ASSERT_EQ(out.val1_probed, 0x1020304050607080, "val1_probed");
+       ASSERT_EQ(out.val2_probed, 0x0a0b0c0d, "val2_probed");
+       ASSERT_EQ(out.val3_probed, 0xfeed, "val3_probed");
+       ASSERT_EQ(out.val4_probed, 0xb9, "val4_probed");
+
+       test_core_autosize__destroy(skel);
+       skel = NULL;
+
+       /* now re-load with handle_signed() enabled, it should fail loading */
+       skel = test_core_autosize__open();
+       if (!ASSERT_OK_PTR(skel, "skel_open"))
+               return;
+
+       load_attr.obj = skel->obj;
+       load_attr.target_btf_path = btf_file;
+       err = bpf_object__load_xattr(&load_attr);
+       if (!ASSERT_ERR(err, "bad_prog_load"))
+               goto cleanup;
+
+cleanup:
+       if (f)
+               fclose(f);
+       if (fd >= 0)
+               close(fd);
+       remove(btf_file);
+       btf__free(btf);
+       test_core_autosize__destroy(skel);
+}
index b771804..b295969 100644 (file)
@@ -7,40 +7,28 @@
 
 static int duration;
 
-static __u64 kallsyms_find(const char *sym)
-{
-       char type, name[500];
-       __u64 addr, res = 0;
-       FILE *f;
-
-       f = fopen("/proc/kallsyms", "r");
-       if (CHECK(!f, "kallsyms_fopen", "failed to open: %d\n", errno))
-               return 0;
-
-       while (fscanf(f, "%llx %c %499s%*[^\n]\n", &addr, &type, name) > 0) {
-               if (strcmp(name, sym) == 0) {
-                       res = addr;
-                       goto out;
-               }
-       }
-
-       CHECK(false, "not_found", "symbol %s not found\n", sym);
-out:
-       fclose(f);
-       return res;
-}
-
 void test_ksyms(void)
 {
-       __u64 per_cpu_start_addr = kallsyms_find("__per_cpu_start");
-       __u64 link_fops_addr = kallsyms_find("bpf_link_fops");
        const char *btf_path = "/sys/kernel/btf/vmlinux";
        struct test_ksyms *skel;
        struct test_ksyms__data *data;
+       __u64 link_fops_addr, per_cpu_start_addr;
        struct stat st;
        __u64 btf_size;
        int err;
 
+       err = kallsyms_find("bpf_link_fops", &link_fops_addr);
+       if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno))
+               return;
+       if (CHECK(err == -ENOENT, "ksym_find", "symbol 'bpf_link_fops' not found\n"))
+               return;
+
+       err = kallsyms_find("__per_cpu_start", &per_cpu_start_addr);
+       if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno))
+               return;
+       if (CHECK(err == -ENOENT, "ksym_find", "symbol '__per_cpu_start' not found\n"))
+               return;
+
        if (CHECK(stat(btf_path, &st), "stat_btf", "err %d\n", errno))
                return;
        btf_size = st.st_size;
diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c
new file mode 100644 (file)
index 0000000..28e26bd
--- /dev/null
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Google */
+
+#include <test_progs.h>
+#include <bpf/libbpf.h>
+#include <bpf/btf.h>
+#include "test_ksyms_btf.skel.h"
+
+static int duration;
+
+void test_ksyms_btf(void)
+{
+       __u64 runqueues_addr, bpf_prog_active_addr;
+       __u32 this_rq_cpu;
+       int this_bpf_prog_active;
+       struct test_ksyms_btf *skel = NULL;
+       struct test_ksyms_btf__data *data;
+       struct btf *btf;
+       int percpu_datasec;
+       int err;
+
+       err = kallsyms_find("runqueues", &runqueues_addr);
+       if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno))
+               return;
+       if (CHECK(err == -ENOENT, "ksym_find", "symbol 'runqueues' not found\n"))
+               return;
+
+       err = kallsyms_find("bpf_prog_active", &bpf_prog_active_addr);
+       if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno))
+               return;
+       if (CHECK(err == -ENOENT, "ksym_find", "symbol 'bpf_prog_active' not found\n"))
+               return;
+
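+       /* per-CPU kernel variables are only usable if vmlinux BTF exposes the
+        * .data..percpu DATASEC, so probe for it and skip the test otherwise
+        */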
+       btf = libbpf_find_kernel_btf();
+       if (CHECK(IS_ERR(btf), "btf_exists", "failed to load kernel BTF: %ld\n",
+                 PTR_ERR(btf)))
+               return;
+
+       percpu_datasec = btf__find_by_name_kind(btf, ".data..percpu",
+                                               BTF_KIND_DATASEC);
+       if (percpu_datasec < 0) {
+               printf("%s:SKIP:no PERCPU DATASEC in kernel btf\n",
+                      __func__);
+               test__skip();
+               goto cleanup;
+       }
+
+       skel = test_ksyms_btf__open_and_load();
+       if (CHECK(!skel, "skel_open", "failed to open and load skeleton\n"))
+               goto cleanup;
+
+       err = test_ksyms_btf__attach(skel);
+       if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err))
+               goto cleanup;
+
+       /* trigger tracepoint */
+       usleep(1);
+
+       data = skel->data;
+       CHECK(data->out__runqueues_addr != runqueues_addr, "runqueues_addr",
+             "got %llu, exp %llu\n",
+             (unsigned long long)data->out__runqueues_addr,
+             (unsigned long long)runqueues_addr);
+       CHECK(data->out__bpf_prog_active_addr != bpf_prog_active_addr, "bpf_prog_active_addr",
+             "got %llu, exp %llu\n",
+             (unsigned long long)data->out__bpf_prog_active_addr,
+             (unsigned long long)bpf_prog_active_addr);
+
+       CHECK(data->out__rq_cpu == -1, "rq_cpu",
+             "got %u, exp != -1\n", data->out__rq_cpu);
+       CHECK(data->out__bpf_prog_active < 0, "bpf_prog_active",
+             "got %d, exp >= 0\n", data->out__bpf_prog_active);
+       CHECK(data->out__cpu_0_rq_cpu != 0, "cpu_rq(0)->cpu",
+             "got %u, exp 0\n", data->out__cpu_0_rq_cpu);
+
+       this_rq_cpu = data->out__this_rq_cpu;
+       CHECK(this_rq_cpu != data->out__rq_cpu, "this_rq_cpu",
+             "got %u, exp %u\n", this_rq_cpu, data->out__rq_cpu);
+
+       this_bpf_prog_active = data->out__this_bpf_prog_active;
+       CHECK(this_bpf_prog_active != data->out__bpf_prog_active, "this_bpf_prog_active",
+             "got %d, exp %d\n", this_bpf_prog_active,
+             data->out__bpf_prog_active);
+
+cleanup:
+       btf__free(btf);
+       test_ksyms_btf__destroy(skel);
+}
index 0419525..fcf54b3 100644 (file)
@@ -37,7 +37,7 @@ void test_pinning(void)
        struct stat statbuf = {};
        struct bpf_object *obj;
        struct bpf_map *map;
-       int err;
+       int err, map_fd;
        DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts,
                .pin_root_path = custpath,
        );
@@ -213,6 +213,53 @@ void test_pinning(void)
        if (CHECK(err, "stat custpinpath", "err %d errno %d\n", err, errno))
                goto out;
 
+       /* remove the custom pin path to re-test it with reuse fd below */
+       err = unlink(custpinpath);
+       if (CHECK(err, "unlink custpinpath", "err %d errno %d\n", err, errno))
+               goto out;
+
+       err = rmdir(custpath);
+       if (CHECK(err, "rmdir custpindir", "err %d errno %d\n", err, errno))
+               goto out;
+
+       bpf_object__close(obj);
+
+       /* test pinning at custom path with reuse fd */
+       obj = bpf_object__open_file(file, NULL);
+       err = libbpf_get_error(obj);
+       if (CHECK(err, "default open", "err %d errno %d\n", err, errno)) {
+               obj = NULL;
+               goto out;
+       }
+
+       map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(__u32),
+                               sizeof(__u64), 1, 0);
+       if (CHECK(map_fd < 0, "create pinmap manually", "fd %d\n", map_fd))
+               goto out;
+
+       map = bpf_object__find_map_by_name(obj, "pinmap");
+       if (CHECK(!map, "find map", "NULL map"))
+               goto close_map_fd;
+
+       err = bpf_map__reuse_fd(map, map_fd);
+       if (CHECK(err, "reuse pinmap fd", "err %d errno %d\n", err, errno))
+               goto close_map_fd;
+
+       err = bpf_map__set_pin_path(map, custpinpath);
+       if (CHECK(err, "set pin path", "err %d errno %d\n", err, errno))
+               goto close_map_fd;
+
+       err = bpf_object__load(obj);
+       if (CHECK(err, "custom load", "err %d errno %d\n", err, errno))
+               goto close_map_fd;
+
+       /* check that pinmap was pinned at the custom path */
+       err = stat(custpinpath, &statbuf);
+       if (CHECK(err, "stat custpinpath", "err %d errno %d\n", err, errno))
+               goto close_map_fd;
+
+close_map_fd:
+       close(map_fd);
 out:
        unlink(pinpath);
        unlink(nopinpath);
index 4c4224e..85f7326 100644 (file)
@@ -198,7 +198,7 @@ static void test_sockmap_copy(enum bpf_map_type map_type)
 {
        DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
        int err, len, src_fd, iter_fd, duration = 0;
-       union bpf_iter_link_info linfo = {0};
+       union bpf_iter_link_info linfo = {};
        __u32 i, num_sockets, num_elems;
        struct bpf_iter_sockmap *skel;
        __s64 *sock_fd = NULL;
index 24ba0d2..c86e672 100644 (file)
@@ -264,9 +264,19 @@ static int check_error_linum(const struct sk_fds *sk_fds)
 
 static void check_hdr_and_close_fds(struct sk_fds *sk_fds)
 {
+       const __u32 expected_inherit_cb_flags =
+               BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG |
+               BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG |
+               BPF_SOCK_OPS_STATE_CB_FLAG;
+
        if (sk_fds_shutdown(sk_fds))
                goto check_linum;
 
+       if (CHECK(expected_inherit_cb_flags != skel->bss->inherit_cb_flags,
+                 "Unexpected inherit_cb_flags", "0x%x != 0x%x\n",
+                 skel->bss->inherit_cb_flags, expected_inherit_cb_flags))
+               goto check_linum;
+
        if (check_hdr_stg(&exp_passive_hdr_stg, sk_fds->passive_fd,
                          "passive_hdr_stg"))
                goto check_linum;
@@ -321,6 +331,8 @@ static void reset_test(void)
        memset(&skel->bss->active_estab_in, 0, optsize);
        memset(&skel->bss->active_fin_in, 0, optsize);
 
+       skel->bss->inherit_cb_flags = 0;
+
        skel->data->test_kind = TCPOPT_EXP;
        skel->data->test_magic = 0xeB9F;
 
diff --git a/tools/testing/selftests/bpf/prog_tests/test_profiler.c b/tools/testing/selftests/bpf/prog_tests/test_profiler.c
new file mode 100644 (file)
index 0000000..4ca2751
--- /dev/null
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include <test_progs.h>
+#include "progs/profiler.h"
+#include "profiler1.skel.h"
+#include "profiler2.skel.h"
+#include "profiler3.skel.h"
+
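+/* run the given raw_tracepoint program once via BPF_PROG_TEST_RUN with a
+ * dummy context and check that it executes and returns 0
+ */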
+static int sanity_run(struct bpf_program *prog)
+{
+       struct bpf_prog_test_run_attr test_attr = {};
+       __u64 args[] = {1, 2, 3};
+       __u32 duration = 0;
+       int err, prog_fd;
+
+       prog_fd = bpf_program__fd(prog);
+       test_attr.prog_fd = prog_fd;
+       test_attr.ctx_in = args;
+       test_attr.ctx_size_in = sizeof(args);
+       err = bpf_prog_test_run_xattr(&test_attr);
+       if (CHECK(err || test_attr.retval, "test_run",
+                 "err %d errno %d retval %d duration %d\n",
+                 err, errno, test_attr.retval, duration))
+               return -1;
+       return 0;
+}
+
+void test_test_profiler(void)
+{
+       struct profiler1 *profiler1_skel = NULL;
+       struct profiler2 *profiler2_skel = NULL;
+       struct profiler3 *profiler3_skel = NULL;
+       __u32 duration = 0;
+       int err;
+
+       profiler1_skel = profiler1__open_and_load();
+       if (CHECK(!profiler1_skel, "profiler1_skel_load", "profiler1 skeleton failed\n"))
+               goto cleanup;
+
+       err = profiler1__attach(profiler1_skel);
+       if (CHECK(err, "profiler1_attach", "profiler1 attach failed: %d\n", err))
+               goto cleanup;
+
+       if (sanity_run(profiler1_skel->progs.raw_tracepoint__sched_process_exec))
+               goto cleanup;
+
+       profiler2_skel = profiler2__open_and_load();
+       if (CHECK(!profiler2_skel, "profiler2_skel_load", "profiler2 skeleton failed\n"))
+               goto cleanup;
+
+       err = profiler2__attach(profiler2_skel);
+       if (CHECK(err, "profiler2_attach", "profiler2 attach failed: %d\n", err))
+               goto cleanup;
+
+       if (sanity_run(profiler2_skel->progs.raw_tracepoint__sched_process_exec))
+               goto cleanup;
+
+       profiler3_skel = profiler3__open_and_load();
+       if (CHECK(!profiler3_skel, "profiler3_skel_load", "profiler3 skeleton failed\n"))
+               goto cleanup;
+
+       err = profiler3__attach(profiler3_skel);
+       if (CHECK(err, "profiler3_attach", "profiler3 attach failed: %d\n", err))
+               goto cleanup;
+
+       if (sanity_run(profiler3_skel->progs.raw_tracepoint__sched_process_exec))
+               goto cleanup;
+cleanup:
+       profiler1__destroy(profiler1_skel);
+       profiler2__destroy(profiler2_skel);
+       profiler3__destroy(profiler3_skel);
+}
index a1f0642..0281095 100644 (file)
@@ -25,7 +25,7 @@ void test_xdp_noinline(void)
                __u8 flags;
        } real_def = {.dst = MAGIC_VAL};
        __u32 ch_key = 11, real_num = 3;
-       __u32 duration, retval, size;
+       __u32 duration = 0, retval, size;
        int err, i;
        __u64 bytes = 0, pkts = 0;
        char buf[128];
index b1b2773..a943d39 100644 (file)
 #define TCP_CA_NAME_MAX 16
 #endif
 
+#ifndef TCP_NOTSENT_LOWAT
+#define TCP_NOTSENT_LOWAT 25
+#endif
+
 #ifndef IFNAMSIZ
 #define IFNAMSIZ 16
 #endif
@@ -128,6 +132,18 @@ static __inline int set_keepalive(struct bpf_sock_addr *ctx)
        return 0;
 }
 
+static __inline int set_notsent_lowat(struct bpf_sock_addr *ctx)
+{
+       int lowat = 65535;
+
+       if (ctx->type == SOCK_STREAM) {
+               if (bpf_setsockopt(ctx, SOL_TCP, TCP_NOTSENT_LOWAT, &lowat, sizeof(lowat)))
+                       return 1;
+       }
+
+       return 0;
+}
+
 SEC("cgroup/connect4")
 int connect_v4_prog(struct bpf_sock_addr *ctx)
 {
@@ -148,6 +164,9 @@ int connect_v4_prog(struct bpf_sock_addr *ctx)
        if (set_keepalive(ctx))
                return 0;
 
+       if (set_notsent_lowat(ctx))
+               return 0;
+
        if (ctx->type != SOCK_STREAM && ctx->type != SOCK_DGRAM)
                return 0;
        else if (ctx->type == SOCK_STREAM)
diff --git a/tools/testing/selftests/bpf/progs/profiler.h b/tools/testing/selftests/bpf/progs/profiler.h
new file mode 100644 (file)
index 0000000..3bac4fd
--- /dev/null
@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#pragma once
+
+#define TASK_COMM_LEN 16
+#define MAX_ANCESTORS 4
+#define MAX_PATH 256
+#define KILL_TARGET_LEN 64
+#define CTL_MAXNAME 10
+#define MAX_ARGS_LEN 4096
+#define MAX_FILENAME_LEN 512
+#define MAX_ENVIRON_LEN 8192
+#define MAX_PATH_DEPTH 32
+#define MAX_FILEPATH_LENGTH (MAX_PATH_DEPTH * MAX_PATH)
+#define MAX_CGROUPS_PATH_DEPTH 8
+
+#define MAX_METADATA_PAYLOAD_LEN TASK_COMM_LEN
+
+#define MAX_CGROUP_PAYLOAD_LEN \
+       (MAX_PATH * 2 + (MAX_PATH * MAX_CGROUPS_PATH_DEPTH))
+
+#define MAX_CAP_PAYLOAD_LEN (MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN)
+
+#define MAX_SYSCTL_PAYLOAD_LEN \
+       (MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN + CTL_MAXNAME + MAX_PATH)
+
+#define MAX_KILL_PAYLOAD_LEN \
+       (MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN + TASK_COMM_LEN + \
+        KILL_TARGET_LEN)
+
+#define MAX_EXEC_PAYLOAD_LEN \
+       (MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN + MAX_FILENAME_LEN + \
+        MAX_ARGS_LEN + MAX_ENVIRON_LEN)
+
+#define MAX_FILEMOD_PAYLOAD_LEN \
+       (MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN + MAX_FILEPATH_LENGTH + \
+        MAX_FILEPATH_LENGTH)
+
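+
+/* event types reported to user space through the perf event buffer
+ * (carried in var_metadata_t.type)
+ */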
+enum data_type {
+       INVALID_EVENT,
+       EXEC_EVENT,
+       FORK_EVENT,
+       KILL_EVENT,
+       SYSCTL_EVENT,
+       FILEMOD_EVENT,
+       MAX_DATA_TYPE_EVENT
+};
+
+enum filemod_type {
+       FMOD_OPEN,
+       FMOD_LINK,
+       FMOD_SYMLINK,
+};
+
+struct ancestors_data_t {
+       pid_t ancestor_pids[MAX_ANCESTORS];
+       uint32_t ancestor_exec_ids[MAX_ANCESTORS];
+       uint64_t ancestor_start_times[MAX_ANCESTORS];
+       uint32_t num_ancestors;
+};
+
+struct var_metadata_t {
+       enum data_type type;
+       pid_t pid;
+       uint32_t exec_id;
+       uid_t uid;
+       gid_t gid;
+       uint64_t start_time;
+       uint32_t cpu_id;
+       uint64_t bpf_stats_num_perf_events;
+       uint64_t bpf_stats_start_ktime_ns;
+       uint8_t comm_length;
+};
+
+struct cgroup_data_t {
+       ino_t cgroup_root_inode;
+       ino_t cgroup_proc_inode;
+       uint64_t cgroup_root_mtime;
+       uint64_t cgroup_proc_mtime;
+       uint16_t cgroup_root_length;
+       uint16_t cgroup_proc_length;
+       uint16_t cgroup_full_length;
+       int cgroup_full_path_root_pos;
+};
+
+struct var_sysctl_data_t {
+       struct var_metadata_t meta;
+       struct cgroup_data_t cgroup_data;
+       struct ancestors_data_t ancestors_info;
+       uint8_t sysctl_val_length;
+       uint16_t sysctl_path_length;
+       char payload[MAX_SYSCTL_PAYLOAD_LEN];
+};
+
+struct var_kill_data_t {
+       struct var_metadata_t meta;
+       struct cgroup_data_t cgroup_data;
+       struct ancestors_data_t ancestors_info;
+       pid_t kill_target_pid;
+       int kill_sig;
+       uint32_t kill_count;
+       uint64_t last_kill_time;
+       uint8_t kill_target_name_length;
+       uint8_t kill_target_cgroup_proc_length;
+       char payload[MAX_KILL_PAYLOAD_LEN];
+       size_t payload_length;
+};
+
+struct var_exec_data_t {
+       struct var_metadata_t meta;
+       struct cgroup_data_t cgroup_data;
+       pid_t parent_pid;
+       uint32_t parent_exec_id;
+       uid_t parent_uid;
+       uint64_t parent_start_time;
+       uint16_t bin_path_length;
+       uint16_t cmdline_length;
+       uint16_t environment_length;
+       char payload[MAX_EXEC_PAYLOAD_LEN];
+};
+
+struct var_fork_data_t {
+       struct var_metadata_t meta;
+       pid_t parent_pid;
+       uint32_t parent_exec_id;
+       uint64_t parent_start_time;
+       char payload[MAX_METADATA_PAYLOAD_LEN];
+};
+
+struct var_filemod_data_t {
+       struct var_metadata_t meta;
+       struct cgroup_data_t cgroup_data;
+       enum filemod_type fmod_type;
+       unsigned int dst_flags;
+       uint32_t src_device_id;
+       uint32_t dst_device_id;
+       ino_t src_inode;
+       ino_t dst_inode;
+       uint16_t src_filepath_length;
+       uint16_t dst_filepath_length;
+       char payload[MAX_FILEMOD_PAYLOAD_LEN];
+};
+
+struct profiler_config_struct {
+       bool fetch_cgroups_from_bpf;
+       ino_t cgroup_fs_inode;
+       ino_t cgroup_login_session_inode;
+       uint64_t kill_signals_mask;
+       ino_t inode_filter;
+       uint32_t stale_info_secs;
+       bool use_variable_buffers;
+       bool read_environ_from_exec;
+       bool enable_cgroup_v1_resolver;
+};
+
+struct bpf_func_stats_data {
+       uint64_t time_elapsed_ns;
+       uint64_t num_executions;
+       uint64_t num_perf_events;
+};
+
+struct bpf_func_stats_ctx {
+       uint64_t start_time_ns;
+       struct bpf_func_stats_data* bpf_func_stats_data_val;
+};
+
+enum bpf_function_id {
+       profiler_bpf_proc_sys_write,
+       profiler_bpf_sched_process_exec,
+       profiler_bpf_sched_process_exit,
+       profiler_bpf_sys_enter_kill,
+       profiler_bpf_do_filp_open_ret,
+       profiler_bpf_sched_process_fork,
+       profiler_bpf_vfs_link,
+       profiler_bpf_vfs_symlink,
+       profiler_bpf_max_function_id
+};
diff --git a/tools/testing/selftests/bpf/progs/profiler.inc.h b/tools/testing/selftests/bpf/progs/profiler.inc.h
new file mode 100644 (file)
index 0000000..0057831
--- /dev/null
@@ -0,0 +1,969 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include <vmlinux.h>
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#include "profiler.h"
+
+#ifndef NULL
+#define NULL 0
+#endif
+
+#define O_WRONLY 00000001
+#define O_RDWR 00000002
+#define O_DIRECTORY 00200000
+#define __O_TMPFILE 020000000
+#define O_TMPFILE (__O_TMPFILE | O_DIRECTORY)
+#define MAX_ERRNO 4095
+#define S_IFMT 00170000
+#define S_IFSOCK 0140000
+#define S_IFLNK 0120000
+#define S_IFREG 0100000
+#define S_IFBLK 0060000
+#define S_IFDIR 0040000
+#define S_IFCHR 0020000
+#define S_IFIFO 0010000
+#define S_ISUID 0004000
+#define S_ISGID 0002000
+#define S_ISVTX 0001000
+#define S_ISLNK(m) (((m)&S_IFMT) == S_IFLNK)
+#define S_ISDIR(m) (((m)&S_IFMT) == S_IFDIR)
+#define S_ISCHR(m) (((m)&S_IFMT) == S_IFCHR)
+#define S_ISBLK(m) (((m)&S_IFMT) == S_IFBLK)
+#define S_ISFIFO(m) (((m)&S_IFMT) == S_IFIFO)
+#define S_ISSOCK(m) (((m)&S_IFMT) == S_IFSOCK)
+#define IS_ERR_VALUE(x) ((unsigned long)(void*)(x) >= (unsigned long)-MAX_ERRNO)
+
+#define KILL_DATA_ARRAY_SIZE 8
+
+struct var_kill_data_arr_t {
+       struct var_kill_data_t array[KILL_DATA_ARRAY_SIZE];
+};
+
+union any_profiler_data_t {
+       struct var_exec_data_t var_exec;
+       struct var_kill_data_t var_kill;
+       struct var_sysctl_data_t var_sysctl;
+       struct var_filemod_data_t var_filemod;
+       struct var_fork_data_t var_fork;
+       struct var_kill_data_arr_t var_kill_data_arr;
+};
+
+volatile struct profiler_config_struct bpf_config = {};
+
+#define FETCH_CGROUPS_FROM_BPF (bpf_config.fetch_cgroups_from_bpf)
+#define CGROUP_FS_INODE (bpf_config.cgroup_fs_inode)
+#define CGROUP_LOGIN_SESSION_INODE \
+       (bpf_config.cgroup_login_session_inode)
+#define KILL_SIGNALS (bpf_config.kill_signals_mask)
+#define STALE_INFO (bpf_config.stale_info_secs)
+#define INODE_FILTER (bpf_config.inode_filter)
+#define READ_ENVIRON_FROM_EXEC (bpf_config.read_environ_from_exec)
+#define ENABLE_CGROUP_V1_RESOLVER (bpf_config.enable_cgroup_v1_resolver)
+
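+/* local "flavor" definitions of kernfs types (the ___52 suffix is ignored by
+ * CO-RE matching); used below to probe old vs. new kernel struct layouts
+ */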
+struct kernfs_iattrs___52 {
+       struct iattr ia_iattr;
+};
+
+struct kernfs_node___52 {
+       union /* kernfs_node_id */ {
+               struct {
+                       u32 ino;
+                       u32 generation;
+               };
+               u64 id;
+       } id;
+};
+
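+/* single-entry per-CPU scratch buffer large enough for any event type; used
+ * instead of the BPF stack for the large payload structs
+ */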
+struct {
+       __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+       __uint(max_entries, 1);
+       __type(key, u32);
+       __type(value, union any_profiler_data_t);
+} data_heap SEC(".maps");
+
+struct {
+       __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+       __uint(key_size, sizeof(int));
+       __uint(value_size, sizeof(int));
+} events SEC(".maps");
+
+struct {
+       __uint(type, BPF_MAP_TYPE_HASH);
+       __uint(max_entries, KILL_DATA_ARRAY_SIZE);
+       __type(key, u32);
+       __type(value, struct var_kill_data_arr_t);
+} var_tpid_to_data SEC(".maps");
+
+struct {
+       __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+       __uint(max_entries, profiler_bpf_max_function_id);
+       __type(key, u32);
+       __type(value, struct bpf_func_stats_data);
+} bpf_func_stats SEC(".maps");
+
+struct {
+       __uint(type, BPF_MAP_TYPE_HASH);
+       __type(key, u32);
+       __type(value, bool);
+       __uint(max_entries, 16);
+} allowed_devices SEC(".maps");
+
+struct {
+       __uint(type, BPF_MAP_TYPE_HASH);
+       __type(key, u64);
+       __type(value, bool);
+       __uint(max_entries, 1024);
+} allowed_file_inodes SEC(".maps");
+
+struct {
+       __uint(type, BPF_MAP_TYPE_HASH);
+       __type(key, u64);
+       __type(value, bool);
+       __uint(max_entries, 1024);
+} allowed_directory_inodes SEC(".maps");
+
+struct {
+       __uint(type, BPF_MAP_TYPE_HASH);
+       __type(key, u32);
+       __type(value, bool);
+       __uint(max_entries, 16);
+} disallowed_exec_inodes SEC(".maps");
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
+#endif
+
+static INLINE bool IS_ERR(const void* ptr)
+{
+       return IS_ERR_VALUE((unsigned long)ptr);
+}
+
+static INLINE u32 get_userspace_pid()
+{
+       return bpf_get_current_pid_tgid() >> 32;
+}
+
+static INLINE bool is_init_process(u32 tgid)
+{
+       return tgid == 1 || tgid == 0;
+}
+
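+/* copy at most max bytes from src to dst with bpf_probe_read(); returns the
+ * number of bytes copied, or 0 if the read fails
+ */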
+static INLINE unsigned long
+probe_read_lim(void* dst, void* src, unsigned long len, unsigned long max)
+{
+       len = len < max ? len : max;
+       if (len > 1) {
+               if (bpf_probe_read(dst, len, src))
+                       return 0;
+       } else if (len == 1) {
+               if (bpf_probe_read(dst, 1, src))
+                       return 0;
+       }
+       return len;
+}
+
+static INLINE int get_var_spid_index(struct var_kill_data_arr_t* arr_struct,
+                                    int spid)
+{
+#ifdef UNROLL
+#pragma unroll
+#endif
+       for (int i = 0; i < ARRAY_SIZE(arr_struct->array); i++)
+               if (arr_struct->array[i].meta.pid == spid)
+                       return i;
+       return -1;
+}
+
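+/* walk up to MAX_ANCESTORS real_parent links, recording each ancestor's tgid,
+ * exec id and start time; stops early at the init process
+ */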
+static INLINE void populate_ancestors(struct task_struct* task,
+                                     struct ancestors_data_t* ancestors_data)
+{
+       struct task_struct* parent = task;
+       u32 num_ancestors, ppid;
+
+       ancestors_data->num_ancestors = 0;
+#ifdef UNROLL
+#pragma unroll
+#endif
+       for (num_ancestors = 0; num_ancestors < MAX_ANCESTORS; num_ancestors++) {
+               parent = BPF_CORE_READ(parent, real_parent);
+               if (parent == NULL)
+                       break;
+               ppid = BPF_CORE_READ(parent, tgid);
+               if (is_init_process(ppid))
+                       break;
+               ancestors_data->ancestor_pids[num_ancestors] = ppid;
+               ancestors_data->ancestor_exec_ids[num_ancestors] =
+                       BPF_CORE_READ(parent, self_exec_id);
+               ancestors_data->ancestor_start_times[num_ancestors] =
+                       BPF_CORE_READ(parent, start_time);
+               ancestors_data->num_ancestors = num_ancestors;
+       }
+}
+
+static INLINE void* read_full_cgroup_path(struct kernfs_node* cgroup_node,
+                                         struct kernfs_node* cgroup_root_node,
+                                         void* payload,
+                                         int* root_pos)
+{
+       void* payload_start = payload;
+       size_t filepart_length;
+
+#ifdef UNROLL
+#pragma unroll
+#endif
+       for (int i = 0; i < MAX_CGROUPS_PATH_DEPTH; i++) {
+               filepart_length =
+                       bpf_probe_read_str(payload, MAX_PATH, BPF_CORE_READ(cgroup_node, name));
+               if (!cgroup_node)
+                       return payload;
+               if (cgroup_node == cgroup_root_node)
+                       *root_pos = payload - payload_start;
+               if (filepart_length <= MAX_PATH) {
+                       barrier_var(filepart_length);
+                       payload += filepart_length;
+               }
+               cgroup_node = BPF_CORE_READ(cgroup_node, parent);
+       }
+       return payload;
+}
+
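+/* older kernels keep the kernfs inode in a union of ino/generation, newer
+ * ones expose a plain u64 id; a CO-RE field-existence check picks the
+ * matching read
+ */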
+static ino_t get_inode_from_kernfs(struct kernfs_node* node)
+{
+       struct kernfs_node___52* node52 = (void*)node;
+
+       if (bpf_core_field_exists(node52->id.ino)) {
+               barrier_var(node52);
+               return BPF_CORE_READ(node52, id.ino);
+       } else {
+               barrier_var(node);
+               return (u64)BPF_CORE_READ(node, id);
+       }
+}
+
+int pids_cgrp_id = 1;
+
+static INLINE void* populate_cgroup_info(struct cgroup_data_t* cgroup_data,
+                                        struct task_struct* task,
+                                        void* payload)
+{
+       struct kernfs_node* root_kernfs =
+               BPF_CORE_READ(task, nsproxy, cgroup_ns, root_cset, dfl_cgrp, kn);
+       struct kernfs_node* proc_kernfs = BPF_CORE_READ(task, cgroups, dfl_cgrp, kn);
+
+       if (ENABLE_CGROUP_V1_RESOLVER) {
+#ifdef UNROLL
+#pragma unroll
+#endif
+               for (int i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+                       struct cgroup_subsys_state* subsys =
+                               BPF_CORE_READ(task, cgroups, subsys[i]);
+                       if (subsys != NULL) {
+                               int subsys_id = BPF_CORE_READ(subsys, ss, id);
+                               if (subsys_id == pids_cgrp_id) {
+                                       proc_kernfs = BPF_CORE_READ(subsys, cgroup, kn);
+                                       root_kernfs = BPF_CORE_READ(subsys, ss, root, kf_root, kn);
+                                       break;
+                               }
+                       }
+               }
+       }
+
+       cgroup_data->cgroup_root_inode = get_inode_from_kernfs(root_kernfs);
+       cgroup_data->cgroup_proc_inode = get_inode_from_kernfs(proc_kernfs);
+
+       if (bpf_core_field_exists(root_kernfs->iattr->ia_mtime)) {
+               cgroup_data->cgroup_root_mtime =
+                       BPF_CORE_READ(root_kernfs, iattr, ia_mtime.tv_nsec);
+               cgroup_data->cgroup_proc_mtime =
+                       BPF_CORE_READ(proc_kernfs, iattr, ia_mtime.tv_nsec);
+       } else {
+               struct kernfs_iattrs___52* root_iattr =
+                       (struct kernfs_iattrs___52*)BPF_CORE_READ(root_kernfs, iattr);
+               cgroup_data->cgroup_root_mtime =
+                       BPF_CORE_READ(root_iattr, ia_iattr.ia_mtime.tv_nsec);
+
+               struct kernfs_iattrs___52* proc_iattr =
+                       (struct kernfs_iattrs___52*)BPF_CORE_READ(proc_kernfs, iattr);
+               cgroup_data->cgroup_proc_mtime =
+                       BPF_CORE_READ(proc_iattr, ia_iattr.ia_mtime.tv_nsec);
+       }
+
+       cgroup_data->cgroup_root_length = 0;
+       cgroup_data->cgroup_proc_length = 0;
+       cgroup_data->cgroup_full_length = 0;
+
+       size_t cgroup_root_length =
+               bpf_probe_read_str(payload, MAX_PATH, BPF_CORE_READ(root_kernfs, name));
+       barrier_var(cgroup_root_length);
+       if (cgroup_root_length <= MAX_PATH) {
+               barrier_var(cgroup_root_length);
+               cgroup_data->cgroup_root_length = cgroup_root_length;
+               payload += cgroup_root_length;
+       }
+
+       size_t cgroup_proc_length =
+               bpf_probe_read_str(payload, MAX_PATH, BPF_CORE_READ(proc_kernfs, name));
+       barrier_var(cgroup_proc_length);
+       if (cgroup_proc_length <= MAX_PATH) {
+               barrier_var(cgroup_proc_length);
+               cgroup_data->cgroup_proc_length = cgroup_proc_length;
+               payload += cgroup_proc_length;
+       }
+
+       if (FETCH_CGROUPS_FROM_BPF) {
+               cgroup_data->cgroup_full_path_root_pos = -1;
+               void* payload_end_pos = read_full_cgroup_path(proc_kernfs, root_kernfs, payload,
+                                                             &cgroup_data->cgroup_full_path_root_pos);
+               cgroup_data->cgroup_full_length = payload_end_pos - payload;
+               payload = payload_end_pos;
+       }
+
+       return (void*)payload;
+}
+
+static INLINE void* populate_var_metadata(struct var_metadata_t* metadata,
+                                         struct task_struct* task,
+                                         u32 pid, void* payload)
+{
+       u64 uid_gid = bpf_get_current_uid_gid();
+
+       metadata->uid = (u32)uid_gid;
+       metadata->gid = uid_gid >> 32;
+       metadata->pid = pid;
+       metadata->exec_id = BPF_CORE_READ(task, self_exec_id);
+       metadata->start_time = BPF_CORE_READ(task, start_time);
+       metadata->comm_length = 0;
+
+       size_t comm_length = bpf_core_read_str(payload, TASK_COMM_LEN, &task->comm);
+       barrier_var(comm_length);
+       if (comm_length <= TASK_COMM_LEN) {
+               barrier_var(comm_length);
+               metadata->comm_length = comm_length;
+               payload += comm_length;
+       }
+
+       return (void*)payload;
+}
+
+static INLINE struct var_kill_data_t*
+get_var_kill_data(struct pt_regs* ctx, int spid, int tpid, int sig)
+{
+       int zero = 0;
+       struct var_kill_data_t* kill_data = bpf_map_lookup_elem(&data_heap, &zero);
+
+       if (kill_data == NULL)
+               return NULL;
+       struct task_struct* task = (struct task_struct*)bpf_get_current_task();
+
+       void* payload = populate_var_metadata(&kill_data->meta, task, spid, kill_data->payload);
+       payload = populate_cgroup_info(&kill_data->cgroup_data, task, payload);
+       size_t payload_length = payload - (void*)kill_data->payload;
+       kill_data->payload_length = payload_length;
+       populate_ancestors(task, &kill_data->ancestors_info);
+       kill_data->meta.type = KILL_EVENT;
+       kill_data->kill_target_pid = tpid;
+       kill_data->kill_sig = sig;
+       kill_data->kill_count = 1;
+       kill_data->last_kill_time = bpf_ktime_get_ns();
+       return kill_data;
+}
+
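+/* record a kill() of a monitored signal: create a new per-target entry in
+ * var_tpid_to_data or update the sending pid's existing slot
+ */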
+static INLINE int trace_var_sys_kill(void* ctx, int tpid, int sig)
+{
+       if ((KILL_SIGNALS & (1ULL << sig)) == 0)
+               return 0;
+
+       u32 spid = get_userspace_pid();
+       struct var_kill_data_arr_t* arr_struct = bpf_map_lookup_elem(&var_tpid_to_data, &tpid);
+
+       if (arr_struct == NULL) {
+               struct var_kill_data_t* kill_data = get_var_kill_data(ctx, spid, tpid, sig);
+               int zero = 0;
+
+               if (kill_data == NULL)
+                       return 0;
+               arr_struct = bpf_map_lookup_elem(&data_heap, &zero);
+               if (arr_struct == NULL)
+                       return 0;
+               bpf_probe_read(&arr_struct->array[0], sizeof(arr_struct->array[0]), kill_data);
+       } else {
+               int index = get_var_spid_index(arr_struct, spid);
+
+               if (index == -1) {
+                       struct var_kill_data_t* kill_data =
+                               get_var_kill_data(ctx, spid, tpid, sig);
+                       if (kill_data == NULL)
+                               return 0;
+#ifdef UNROLL
+#pragma unroll
+#endif
+                       for (int i = 0; i < ARRAY_SIZE(arr_struct->array); i++)
+                               if (arr_struct->array[i].meta.pid == 0) {
+                                       bpf_probe_read(&arr_struct->array[i],
+                                                      sizeof(arr_struct->array[i]), kill_data);
+                                       bpf_map_update_elem(&var_tpid_to_data, &tpid,
+                                                           arr_struct, 0);
+
+                                       return 0;
+                               }
+                       return 0;
+               }
+
+               struct var_kill_data_t* kill_data = &arr_struct->array[index];
+
+               u64 delta_sec =
+                       (bpf_ktime_get_ns() - kill_data->last_kill_time) / 1000000000;
+
+               if (delta_sec < STALE_INFO) {
+                       kill_data->kill_count++;
+                       kill_data->last_kill_time = bpf_ktime_get_ns();
+                       bpf_probe_read(&arr_struct->array[index],
+                                      sizeof(arr_struct->array[index]),
+                                      kill_data);
+               } else {
+                       struct var_kill_data_t* kill_data =
+                               get_var_kill_data(ctx, spid, tpid, sig);
+                       if (kill_data == NULL)
+                               return 0;
+                       bpf_probe_read(&arr_struct->array[index],
+                                      sizeof(arr_struct->array[index]),
+                                      kill_data);
+               }
+       }
+       bpf_map_update_elem(&var_tpid_to_data, &tpid, arr_struct, 0);
+       return 0;
+}
+
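+/* per-function runtime accounting (executions, elapsed time, perf events)
+ * kept in the bpf_func_stats per-CPU array; enter/exit bracket each program
+ */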
+static INLINE void bpf_stats_enter(struct bpf_func_stats_ctx* bpf_stat_ctx,
+                                  enum bpf_function_id func_id)
+{
+       int func_id_key = func_id;
+
+       bpf_stat_ctx->start_time_ns = bpf_ktime_get_ns();
+       bpf_stat_ctx->bpf_func_stats_data_val =
+               bpf_map_lookup_elem(&bpf_func_stats, &func_id_key);
+       if (bpf_stat_ctx->bpf_func_stats_data_val)
+               bpf_stat_ctx->bpf_func_stats_data_val->num_executions++;
+}
+
+static INLINE void bpf_stats_exit(struct bpf_func_stats_ctx* bpf_stat_ctx)
+{
+       if (bpf_stat_ctx->bpf_func_stats_data_val)
+               bpf_stat_ctx->bpf_func_stats_data_val->time_elapsed_ns +=
+                       bpf_ktime_get_ns() - bpf_stat_ctx->start_time_ns;
+}
+
+static INLINE void
+bpf_stats_pre_submit_var_perf_event(struct bpf_func_stats_ctx* bpf_stat_ctx,
+                                   struct var_metadata_t* meta)
+{
+       if (bpf_stat_ctx->bpf_func_stats_data_val) {
+               bpf_stat_ctx->bpf_func_stats_data_val->num_perf_events++;
+               meta->bpf_stats_num_perf_events =
+                       bpf_stat_ctx->bpf_func_stats_data_val->num_perf_events;
+       }
+       meta->bpf_stats_start_ktime_ns = bpf_stat_ctx->start_time_ns;
+       meta->cpu_id = bpf_get_smp_processor_id();
+}
+
+static INLINE size_t
+read_absolute_file_path_from_dentry(struct dentry* filp_dentry, void* payload)
+{
+       size_t length = 0;
+       size_t filepart_length;
+       struct dentry* parent_dentry;
+
+#ifdef UNROLL
+#pragma unroll
+#endif
+       for (int i = 0; i < MAX_PATH_DEPTH; i++) {
+               filepart_length = bpf_probe_read_str(payload, MAX_PATH,
+                                                    BPF_CORE_READ(filp_dentry, d_name.name));
+               barrier_var(filepart_length);
+               if (filepart_length > MAX_PATH)
+                       break;
+               barrier_var(filepart_length);
+               payload += filepart_length;
+               length += filepart_length;
+
+               parent_dentry = BPF_CORE_READ(filp_dentry, d_parent);
+               if (filp_dentry == parent_dentry)
+                       break;
+               filp_dentry = parent_dentry;
+       }
+
+       return length;
+}
+
+static INLINE bool
+is_ancestor_in_allowed_inodes(struct dentry* filp_dentry)
+{
+       struct dentry* parent_dentry;
+#ifdef UNROLL
+#pragma unroll
+#endif
+       for (int i = 0; i < MAX_PATH_DEPTH; i++) {
+               u64 dir_ino = BPF_CORE_READ(filp_dentry, d_inode, i_ino);
+               bool* allowed_dir = bpf_map_lookup_elem(&allowed_directory_inodes, &dir_ino);
+
+               if (allowed_dir != NULL)
+                       return true;
+               parent_dentry = BPF_CORE_READ(filp_dentry, d_parent);
+               if (filp_dentry == parent_dentry)
+                       break;
+               filp_dentry = parent_dentry;
+       }
+       return false;
+}
+
+static INLINE bool is_dentry_allowed_for_filemod(struct dentry* file_dentry,
+                                                u32* device_id,
+                                                u64* file_ino)
+{
+       u32 dev_id = BPF_CORE_READ(file_dentry, d_sb, s_dev);
+       *device_id = dev_id;
+       bool* allowed_device = bpf_map_lookup_elem(&allowed_devices, &dev_id);
+
+       if (allowed_device == NULL)
+               return false;
+
+       u64 ino = BPF_CORE_READ(file_dentry, d_inode, i_ino);
+       *file_ino = ino;
+       bool* allowed_file = bpf_map_lookup_elem(&allowed_file_inodes, &ino);
+
+       if (allowed_file == NULL)
+               if (!is_ancestor_in_allowed_inodes(BPF_CORE_READ(file_dentry, d_parent)))
+                       return false;
+       return true;
+}
+
+SEC("kprobe/proc_sys_write")
+ssize_t BPF_KPROBE(kprobe__proc_sys_write,
+                  struct file* filp, const char* buf,
+                  size_t count, loff_t* ppos)
+{
+       struct bpf_func_stats_ctx stats_ctx;
+       bpf_stats_enter(&stats_ctx, profiler_bpf_proc_sys_write);
+
+       u32 pid = get_userspace_pid();
+       int zero = 0;
+       struct var_sysctl_data_t* sysctl_data =
+               bpf_map_lookup_elem(&data_heap, &zero);
+       if (!sysctl_data)
+               goto out;
+
+       struct task_struct* task = (struct task_struct*)bpf_get_current_task();
+       sysctl_data->meta.type = SYSCTL_EVENT;
+       void* payload = populate_var_metadata(&sysctl_data->meta, task, pid, sysctl_data->payload);
+       payload = populate_cgroup_info(&sysctl_data->cgroup_data, task, payload);
+
+       populate_ancestors(task, &sysctl_data->ancestors_info);
+
+       sysctl_data->sysctl_val_length = 0;
+       sysctl_data->sysctl_path_length = 0;
+
+       size_t sysctl_val_length = bpf_probe_read_str(payload, CTL_MAXNAME, buf);
+       barrier_var(sysctl_val_length);
+       if (sysctl_val_length <= CTL_MAXNAME) {
+               barrier_var(sysctl_val_length);
+               sysctl_data->sysctl_val_length = sysctl_val_length;
+               payload += sysctl_val_length;
+       }
+
+       size_t sysctl_path_length = bpf_probe_read_str(payload, MAX_PATH,
+                                                      BPF_CORE_READ(filp, f_path.dentry, d_name.name));
+       barrier_var(sysctl_path_length);
+       if (sysctl_path_length <= MAX_PATH) {
+               barrier_var(sysctl_path_length);
+               sysctl_data->sysctl_path_length = sysctl_path_length;
+               payload += sysctl_path_length;
+       }
+
+       bpf_stats_pre_submit_var_perf_event(&stats_ctx, &sysctl_data->meta);
+       unsigned long data_len = payload - (void*)sysctl_data;
+       data_len = data_len > sizeof(struct var_sysctl_data_t)
+               ? sizeof(struct var_sysctl_data_t)
+               : data_len;
+       bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, sysctl_data, data_len);
+out:
+       bpf_stats_exit(&stats_ctx);
+       return 0;
+}
+
+SEC("tracepoint/syscalls/sys_enter_kill")
+int tracepoint__syscalls__sys_enter_kill(struct trace_event_raw_sys_enter* ctx)
+{
+       struct bpf_func_stats_ctx stats_ctx;
+
+       bpf_stats_enter(&stats_ctx, profiler_bpf_sys_enter_kill);
+       int pid = ctx->args[0];
+       int sig = ctx->args[1];
+       int ret = trace_var_sys_kill(ctx, pid, sig);
+       bpf_stats_exit(&stats_ctx);
+       return ret;
+};
+
+SEC("raw_tracepoint/sched_process_exit")
+int raw_tracepoint__sched_process_exit(void* ctx)
+{
+       int zero = 0;
+       struct bpf_func_stats_ctx stats_ctx;
+       bpf_stats_enter(&stats_ctx, profiler_bpf_sched_process_exit);
+
+       u32 tpid = get_userspace_pid();
+
+       struct var_kill_data_arr_t* arr_struct = bpf_map_lookup_elem(&var_tpid_to_data, &tpid);
+       struct var_kill_data_t* kill_data = bpf_map_lookup_elem(&data_heap, &zero);
+
+       if (arr_struct == NULL || kill_data == NULL)
+               goto out;
+
+       struct task_struct* task = (struct task_struct*)bpf_get_current_task();
+       struct kernfs_node* proc_kernfs = BPF_CORE_READ(task, cgroups, dfl_cgrp, kn);
+
+#ifdef UNROLL
+#pragma unroll
+#endif
+       for (int i = 0; i < ARRAY_SIZE(arr_struct->array); i++) {
+               struct var_kill_data_t* past_kill_data = &arr_struct->array[i];
+
+               if (past_kill_data != NULL && past_kill_data->kill_target_pid == tpid) {
+                       bpf_probe_read(kill_data, sizeof(*past_kill_data), past_kill_data);
+                       void* payload = kill_data->payload;
+                       size_t offset = kill_data->payload_length;
+                       if (offset >= MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN)
+                               return 0;
+                       payload += offset;
+
+                       kill_data->kill_target_name_length = 0;
+                       kill_data->kill_target_cgroup_proc_length = 0;
+
+                       size_t comm_length = bpf_core_read_str(payload, TASK_COMM_LEN, &task->comm);
+                       barrier_var(comm_length);
+                       if (comm_length <= TASK_COMM_LEN) {
+                               barrier_var(comm_length);
+                               kill_data->kill_target_name_length = comm_length;
+                               payload += comm_length;
+                       }
+
+                       size_t cgroup_proc_length = bpf_probe_read_str(payload, KILL_TARGET_LEN,
+                                                                      BPF_CORE_READ(proc_kernfs, name));
+                       barrier_var(cgroup_proc_length);
+                       if (cgroup_proc_length <= KILL_TARGET_LEN) {
+                               barrier_var(cgroup_proc_length);
+                               kill_data->kill_target_cgroup_proc_length = cgroup_proc_length;
+                               payload += cgroup_proc_length;
+                       }
+
+                       bpf_stats_pre_submit_var_perf_event(&stats_ctx, &kill_data->meta);
+                       unsigned long data_len = (void*)payload - (void*)kill_data;
+                       data_len = data_len > sizeof(struct var_kill_data_t)
+                               ? sizeof(struct var_kill_data_t)
+                               : data_len;
+                       bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, kill_data, data_len);
+               }
+       }
+       bpf_map_delete_elem(&var_tpid_to_data, &tpid);
+out:
+       bpf_stats_exit(&stats_ctx);
+       return 0;
+}
+
+SEC("raw_tracepoint/sched_process_exec")
+int raw_tracepoint__sched_process_exec(struct bpf_raw_tracepoint_args* ctx)
+{
+       struct bpf_func_stats_ctx stats_ctx;
+       bpf_stats_enter(&stats_ctx, profiler_bpf_sched_process_exec);
+
+       struct linux_binprm* bprm = (struct linux_binprm*)ctx->args[2];
+       u64 inode = BPF_CORE_READ(bprm, file, f_inode, i_ino);
+
+       bool* should_filter_binprm = bpf_map_lookup_elem(&disallowed_exec_inodes, &inode);
+       if (should_filter_binprm != NULL)
+               goto out;
+
+       int zero = 0;
+       struct var_exec_data_t* proc_exec_data = bpf_map_lookup_elem(&data_heap, &zero);
+       if (!proc_exec_data)
+               goto out;
+
+       if (INODE_FILTER && inode != INODE_FILTER)
+               return 0;
+
+       u32 pid = get_userspace_pid();
+       struct task_struct* task = (struct task_struct*)bpf_get_current_task();
+
+       proc_exec_data->meta.type = EXEC_EVENT;
+       proc_exec_data->bin_path_length = 0;
+       proc_exec_data->cmdline_length = 0;
+       proc_exec_data->environment_length = 0;
+       void* payload = populate_var_metadata(&proc_exec_data->meta, task, pid,
+                                             proc_exec_data->payload);
+       payload = populate_cgroup_info(&proc_exec_data->cgroup_data, task, payload);
+
+       struct task_struct* parent_task = BPF_CORE_READ(task, real_parent);
+       proc_exec_data->parent_pid = BPF_CORE_READ(parent_task, tgid);
+       proc_exec_data->parent_uid = BPF_CORE_READ(parent_task, real_cred, uid.val);
+       proc_exec_data->parent_exec_id = BPF_CORE_READ(parent_task, self_exec_id);
+       proc_exec_data->parent_start_time = BPF_CORE_READ(parent_task, start_time);
+
+       const char* filename = BPF_CORE_READ(bprm, filename);
+       size_t bin_path_length = bpf_probe_read_str(payload, MAX_FILENAME_LEN, filename);
+       barrier_var(bin_path_length);
+       if (bin_path_length <= MAX_FILENAME_LEN) {
+               barrier_var(bin_path_length);
+               proc_exec_data->bin_path_length = bin_path_length;
+               payload += bin_path_length;
+       }
+
+       void* arg_start = (void*)BPF_CORE_READ(task, mm, arg_start);
+       void* arg_end = (void*)BPF_CORE_READ(task, mm, arg_end);
+       unsigned int cmdline_length = probe_read_lim(payload, arg_start,
+                                                    arg_end - arg_start, MAX_ARGS_LEN);
+
+       if (cmdline_length <= MAX_ARGS_LEN) {
+               barrier_var(cmdline_length);
+               proc_exec_data->cmdline_length = cmdline_length;
+               payload += cmdline_length;
+       }
+
+       if (READ_ENVIRON_FROM_EXEC) {
+               void* env_start = (void*)BPF_CORE_READ(task, mm, env_start);
+               void* env_end = (void*)BPF_CORE_READ(task, mm, env_end);
+               unsigned long env_len = probe_read_lim(payload, env_start,
+                                                      env_end - env_start, MAX_ENVIRON_LEN);
+               if (env_len <= MAX_ENVIRON_LEN) {
+                       proc_exec_data->environment_length = env_len;
+                       payload += env_len;
+               }
+       }
+
+       bpf_stats_pre_submit_var_perf_event(&stats_ctx, &proc_exec_data->meta);
+       unsigned long data_len = payload - (void*)proc_exec_data;
+       data_len = data_len > sizeof(struct var_exec_data_t)
+               ? sizeof(struct var_exec_data_t)
+               : data_len;
+       bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, proc_exec_data, data_len);
+out:
+       bpf_stats_exit(&stats_ctx);
+       return 0;
+}
+
+SEC("kretprobe/do_filp_open")
+int kprobe_ret__do_filp_open(struct pt_regs* ctx)
+{
+       struct bpf_func_stats_ctx stats_ctx;
+       bpf_stats_enter(&stats_ctx, profiler_bpf_do_filp_open_ret);
+
+       struct file* filp = (struct file*)PT_REGS_RC_CORE(ctx);
+
+       if (filp == NULL || IS_ERR(filp))
+               goto out;
+       unsigned int flags = BPF_CORE_READ(filp, f_flags);
+       if ((flags & (O_RDWR | O_WRONLY)) == 0)
+               goto out;
+       if ((flags & O_TMPFILE) > 0)
+               goto out;
+       struct inode* file_inode = BPF_CORE_READ(filp, f_inode);
+       umode_t mode = BPF_CORE_READ(file_inode, i_mode);
+       if (S_ISDIR(mode) || S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode) ||
+           S_ISSOCK(mode))
+               goto out;
+
+       struct dentry* filp_dentry = BPF_CORE_READ(filp, f_path.dentry);
+       u32 device_id = 0;
+       u64 file_ino = 0;
+       if (!is_dentry_allowed_for_filemod(filp_dentry, &device_id, &file_ino))
+               goto out;
+
+       int zero = 0;
+       struct var_filemod_data_t* filemod_data = bpf_map_lookup_elem(&data_heap, &zero);
+       if (!filemod_data)
+               goto out;
+
+       u32 pid = get_userspace_pid();
+       struct task_struct* task = (struct task_struct*)bpf_get_current_task();
+
+       filemod_data->meta.type = FILEMOD_EVENT;
+       filemod_data->fmod_type = FMOD_OPEN;
+       filemod_data->dst_flags = flags;
+       filemod_data->src_inode = 0;
+       filemod_data->dst_inode = file_ino;
+       filemod_data->src_device_id = 0;
+       filemod_data->dst_device_id = device_id;
+       filemod_data->src_filepath_length = 0;
+       filemod_data->dst_filepath_length = 0;
+
+       void* payload = populate_var_metadata(&filemod_data->meta, task, pid,
+                                             filemod_data->payload);
+       payload = populate_cgroup_info(&filemod_data->cgroup_data, task, payload);
+
+       size_t len = read_absolute_file_path_from_dentry(filp_dentry, payload);
+       barrier_var(len);
+       if (len <= MAX_FILEPATH_LENGTH) {
+               barrier_var(len);
+               payload += len;
+               filemod_data->dst_filepath_length = len;
+       }
+       bpf_stats_pre_submit_var_perf_event(&stats_ctx, &filemod_data->meta);
+       unsigned long data_len = payload - (void*)filemod_data;
+       data_len = data_len > sizeof(*filemod_data) ? sizeof(*filemod_data) : data_len;
+       bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, filemod_data, data_len);
+out:
+       bpf_stats_exit(&stats_ctx);
+       return 0;
+}
+
+SEC("kprobe/vfs_link")
+int BPF_KPROBE(kprobe__vfs_link,
+              struct dentry* old_dentry, struct inode* dir,
+              struct dentry* new_dentry, struct inode** delegated_inode)
+{
+       struct bpf_func_stats_ctx stats_ctx;
+       bpf_stats_enter(&stats_ctx, profiler_bpf_vfs_link);
+
+       u32 src_device_id = 0;
+       u64 src_file_ino = 0;
+       u32 dst_device_id = 0;
+       u64 dst_file_ino = 0;
+       if (!is_dentry_allowed_for_filemod(old_dentry, &src_device_id, &src_file_ino) &&
+           !is_dentry_allowed_for_filemod(new_dentry, &dst_device_id, &dst_file_ino))
+               goto out;
+
+       int zero = 0;
+       struct var_filemod_data_t* filemod_data = bpf_map_lookup_elem(&data_heap, &zero);
+       if (!filemod_data)
+               goto out;
+
+       u32 pid = get_userspace_pid();
+       struct task_struct* task = (struct task_struct*)bpf_get_current_task();
+
+       filemod_data->meta.type = FILEMOD_EVENT;
+       filemod_data->fmod_type = FMOD_LINK;
+       filemod_data->dst_flags = 0;
+       filemod_data->src_inode = src_file_ino;
+       filemod_data->dst_inode = dst_file_ino;
+       filemod_data->src_device_id = src_device_id;
+       filemod_data->dst_device_id = dst_device_id;
+       filemod_data->src_filepath_length = 0;
+       filemod_data->dst_filepath_length = 0;
+
+       void* payload = populate_var_metadata(&filemod_data->meta, task, pid,
+                                             filemod_data->payload);
+       payload = populate_cgroup_info(&filemod_data->cgroup_data, task, payload);
+
+       size_t len = read_absolute_file_path_from_dentry(old_dentry, payload);
+       barrier_var(len);
+       if (len <= MAX_FILEPATH_LENGTH) {
+               barrier_var(len);
+               payload += len;
+               filemod_data->src_filepath_length = len;
+       }
+
+       len = read_absolute_file_path_from_dentry(new_dentry, payload);
+       barrier_var(len);
+       if (len <= MAX_FILEPATH_LENGTH) {
+               barrier_var(len);
+               payload += len;
+               filemod_data->dst_filepath_length = len;
+       }
+
+       bpf_stats_pre_submit_var_perf_event(&stats_ctx, &filemod_data->meta);
+       unsigned long data_len = payload - (void*)filemod_data;
+       data_len = data_len > sizeof(*filemod_data) ? sizeof(*filemod_data) : data_len;
+       bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, filemod_data, data_len);
+out:
+       bpf_stats_exit(&stats_ctx);
+       return 0;
+}
+
+SEC("kprobe/vfs_symlink")
+int BPF_KPROBE(kprobe__vfs_symlink, struct inode* dir, struct dentry* dentry,
+              const char* oldname)
+{
+       struct bpf_func_stats_ctx stats_ctx;
+       bpf_stats_enter(&stats_ctx, profiler_bpf_vfs_symlink);
+
+       u32 dst_device_id = 0;
+       u64 dst_file_ino = 0;
+       if (!is_dentry_allowed_for_filemod(dentry, &dst_device_id, &dst_file_ino))
+               goto out;
+
+       int zero = 0;
+       struct var_filemod_data_t* filemod_data = bpf_map_lookup_elem(&data_heap, &zero);
+       if (!filemod_data)
+               goto out;
+
+       u32 pid = get_userspace_pid();
+       struct task_struct* task = (struct task_struct*)bpf_get_current_task();
+
+       filemod_data->meta.type = FILEMOD_EVENT;
+       filemod_data->fmod_type = FMOD_SYMLINK;
+       filemod_data->dst_flags = 0;
+       filemod_data->src_inode = 0;
+       filemod_data->dst_inode = dst_file_ino;
+       filemod_data->src_device_id = 0;
+       filemod_data->dst_device_id = dst_device_id;
+       filemod_data->src_filepath_length = 0;
+       filemod_data->dst_filepath_length = 0;
+
+       void* payload = populate_var_metadata(&filemod_data->meta, task, pid,
+                                             filemod_data->payload);
+       payload = populate_cgroup_info(&filemod_data->cgroup_data, task, payload);
+
+       size_t len = bpf_probe_read_str(payload, MAX_FILEPATH_LENGTH, oldname);
+       barrier_var(len);
+       if (len <= MAX_FILEPATH_LENGTH) {
+               barrier_var(len);
+               payload += len;
+               filemod_data->src_filepath_length = len;
+       }
+       len = read_absolute_file_path_from_dentry(dentry, payload);
+       barrier_var(len);
+       if (len <= MAX_FILEPATH_LENGTH) {
+               barrier_var(len);
+               payload += len;
+               filemod_data->dst_filepath_length = len;
+       }
+       bpf_stats_pre_submit_var_perf_event(&stats_ctx, &filemod_data->meta);
+       unsigned long data_len = payload - (void*)filemod_data;
+       data_len = data_len > sizeof(*filemod_data) ? sizeof(*filemod_data) : data_len;
+       bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, filemod_data, data_len);
+out:
+       bpf_stats_exit(&stats_ctx);
+       return 0;
+}
+
+SEC("raw_tracepoint/sched_process_fork")
+int raw_tracepoint__sched_process_fork(struct bpf_raw_tracepoint_args* ctx)
+{
+       struct bpf_func_stats_ctx stats_ctx;
+       bpf_stats_enter(&stats_ctx, profiler_bpf_sched_process_fork);
+
+       int zero = 0;
+       struct var_fork_data_t* fork_data = bpf_map_lookup_elem(&data_heap, &zero);
+       if (!fork_data)
+               goto out;
+
+       struct task_struct* parent = (struct task_struct*)ctx->args[0];
+       struct task_struct* child = (struct task_struct*)ctx->args[1];
+       fork_data->meta.type = FORK_EVENT;
+
+       void* payload = populate_var_metadata(&fork_data->meta, child,
+                                             BPF_CORE_READ(child, pid), fork_data->payload);
+       fork_data->parent_pid = BPF_CORE_READ(parent, pid);
+       fork_data->parent_exec_id = BPF_CORE_READ(parent, self_exec_id);
+       fork_data->parent_start_time = BPF_CORE_READ(parent, start_time);
+       bpf_stats_pre_submit_var_perf_event(&stats_ctx, &fork_data->meta);
+
+       unsigned long data_len = payload - (void*)fork_data;
+       data_len = data_len > sizeof(*fork_data) ? sizeof(*fork_data) : data_len;
+       bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, fork_data, data_len);
+out:
+       bpf_stats_exit(&stats_ctx);
+       return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/profiler1.c b/tools/testing/selftests/bpf/progs/profiler1.c
new file mode 100644 (file)
index 0000000..4df9088
--- /dev/null
@@ -0,0 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#define barrier_var(var) asm volatile("" : "=r"(var) : "0"(var))
+#define UNROLL
+#define INLINE __always_inline
+#include "profiler.inc.h"
diff --git a/tools/testing/selftests/bpf/progs/profiler2.c b/tools/testing/selftests/bpf/progs/profiler2.c
new file mode 100644 (file)
index 0000000..0f32a3c
--- /dev/null
@@ -0,0 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#define barrier_var(var) /**/
+/* undef #define UNROLL */
+#define INLINE /**/
+#include "profiler.inc.h"
diff --git a/tools/testing/selftests/bpf/progs/profiler3.c b/tools/testing/selftests/bpf/progs/profiler3.c
new file mode 100644 (file)
index 0000000..6249fc3
--- /dev/null
@@ -0,0 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#define barrier_var(var) /**/
+#define UNROLL
+#define INLINE __noinline
+#include "profiler.inc.h"
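The three variants above compile the same profiler.inc.h body under different
macro configurations of barrier_var(), loop unrolling and inlining, so the
verifier is exercised along different paths. A fourth combination, shown here
only as an illustrative sketch (it is not part of the series), would follow the
same pattern:

        // SPDX-License-Identifier: GPL-2.0
        /* hypothetical profiler4.c: no barrier hints, no unrolling, forced inlining */
        #define barrier_var(var) /**/
        /* UNROLL intentionally left undefined */
        #define INLINE __always_inline
        #include "profiler.inc.h"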
index 193fe01..c1e0c8c 100644 (file)
@@ -41,6 +41,43 @@ struct outer_arr {
        .values = { (void *)&inner_map1, 0, (void *)&inner_map2 },
 };
 
+struct inner_map_sz3 {
+       __uint(type, BPF_MAP_TYPE_ARRAY);
+       __uint(map_flags, BPF_F_INNER_MAP);
+       __uint(max_entries, 3);
+       __type(key, int);
+       __type(value, int);
+} inner_map3 SEC(".maps"),
+  inner_map4 SEC(".maps");
+
+struct inner_map_sz4 {
+       __uint(type, BPF_MAP_TYPE_ARRAY);
+       __uint(map_flags, BPF_F_INNER_MAP);
+       __uint(max_entries, 5);
+       __type(key, int);
+       __type(value, int);
+} inner_map5 SEC(".maps");
+
+struct outer_arr_dyn {
+       __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
+       __uint(max_entries, 3);
+       __uint(key_size, sizeof(int));
+       __uint(value_size, sizeof(int));
+       __array(values, struct {
+               __uint(type, BPF_MAP_TYPE_ARRAY);
+               __uint(map_flags, BPF_F_INNER_MAP);
+               __uint(max_entries, 1);
+               __type(key, int);
+               __type(value, int);
+       });
+} outer_arr_dyn SEC(".maps") = {
+       .values = {
+               [0] = (void *)&inner_map3,
+               [1] = (void *)&inner_map4,
+               [2] = (void *)&inner_map5,
+       },
+};
+
 struct outer_hash {
        __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS);
        __uint(max_entries, 5);
@@ -101,6 +138,12 @@ int handle__sys_enter(void *ctx)
        val = input + 1;
        bpf_map_update_elem(inner_map, &key, &val, 0);
 
+       inner_map = bpf_map_lookup_elem(&outer_arr_dyn, &key);
+       if (!inner_map)
+               return 1;
+       val = input + 2;
+       bpf_map_update_elem(inner_map, &key, &val, 0);
+
        return 0;
 }
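All inner maps wired into outer_arr_dyn are created with BPF_F_INNER_MAP, which
is what allows arrays with different max_entries to share one outer
array-of-maps; the same flag also lets user space swap inner maps at run time.
A minimal sketch, assuming the object above is loaded through a libbpf skeleton
called skel (the skeleton name is illustrative):

        /* user space: replace the inner map in slot 2 with the 5-entry array.
         * Allowed because every inner map carries BPF_F_INNER_MAP; without the
         * flag the kernel insists on identical max_entries.
         */
        int outer_fd = bpf_map__fd(skel->maps.outer_arr_dyn);
        int inner_fd = bpf_map__fd(skel->maps.inner_map5);
        int key = 2;
        int err = bpf_map_update_elem(outer_fd, &key, &inner_fd, BPF_ANY);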
 
diff --git a/tools/testing/selftests/bpf/progs/test_core_autosize.c b/tools/testing/selftests/bpf/progs/test_core_autosize.c
new file mode 100644 (file)
index 0000000..44f5aa2
--- /dev/null
@@ -0,0 +1,172 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+
+#include <linux/bpf.h>
+#include <stdint.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+
+char _license[] SEC("license") = "GPL";
+
+/* fields of exactly the same size */
+struct test_struct___samesize {
+       void *ptr;
+       unsigned long long val1;
+       unsigned int val2;
+       unsigned short val3;
+       unsigned char val4;
+} __attribute__((preserve_access_index));
+
+/* unsigned fields that have to be downsized by libbpf */
+struct test_struct___downsize {
+       void *ptr;
+       unsigned long val1;
+       unsigned long val2;
+       unsigned long val3;
+       unsigned long val4;
+       /* total sz: 40 */
+} __attribute__((preserve_access_index));
+
+/* fields with signed integers of wrong size, should be rejected */
+struct test_struct___signed {
+       void *ptr;
+       long val1;
+       long val2;
+       long val3;
+       long val4;
+} __attribute__((preserve_access_index));
+
+/* real layout and sizes according to test's (32-bit) BTF */
+struct test_struct___real {
+       unsigned int ptr; /* can't use `void *`, it is always 8 bytes on the BPF target */
+       unsigned int val2;
+       unsigned long long val1;
+       unsigned short val3;
+       unsigned char val4;
+       unsigned char _pad;
+       /* total sz: 20 */
+};
+
+struct test_struct___real input = {
+       .ptr = 0x01020304,
+       .val1 = 0x1020304050607080,
+       .val2 = 0x0a0b0c0d,
+       .val3 = 0xfeed,
+       .val4 = 0xb9,
+       ._pad = 0xff, /* make sure no accidental zeros are present */
+};
+
+unsigned long long ptr_samesized = 0;
+unsigned long long val1_samesized = 0;
+unsigned long long val2_samesized = 0;
+unsigned long long val3_samesized = 0;
+unsigned long long val4_samesized = 0;
+struct test_struct___real output_samesized = {};
+
+unsigned long long ptr_downsized = 0;
+unsigned long long val1_downsized = 0;
+unsigned long long val2_downsized = 0;
+unsigned long long val3_downsized = 0;
+unsigned long long val4_downsized = 0;
+struct test_struct___real output_downsized = {};
+
+unsigned long long ptr_probed = 0;
+unsigned long long val1_probed = 0;
+unsigned long long val2_probed = 0;
+unsigned long long val3_probed = 0;
+unsigned long long val4_probed = 0;
+
+unsigned long long ptr_signed = 0;
+unsigned long long val1_signed = 0;
+unsigned long long val2_signed = 0;
+unsigned long long val3_signed = 0;
+unsigned long long val4_signed = 0;
+struct test_struct___real output_signed = {};
+
+SEC("raw_tp/sys_exit")
+int handle_samesize(void *ctx)
+{
+       struct test_struct___samesize *in = (void *)&input;
+       struct test_struct___samesize *out = (void *)&output_samesized;
+
+       ptr_samesized = (unsigned long long)in->ptr;
+       val1_samesized = in->val1;
+       val2_samesized = in->val2;
+       val3_samesized = in->val3;
+       val4_samesized = in->val4;
+
+       out->ptr = in->ptr;
+       out->val1 = in->val1;
+       out->val2 = in->val2;
+       out->val3 = in->val3;
+       out->val4 = in->val4;
+
+       return 0;
+}
+
+SEC("raw_tp/sys_exit")
+int handle_downsize(void *ctx)
+{
+       struct test_struct___downsize *in = (void *)&input;
+       struct test_struct___downsize *out = (void *)&output_downsized;
+
+       ptr_downsized = (unsigned long long)in->ptr;
+       val1_downsized = in->val1;
+       val2_downsized = in->val2;
+       val3_downsized = in->val3;
+       val4_downsized = in->val4;
+
+       out->ptr = in->ptr;
+       out->val1 = in->val1;
+       out->val2 = in->val2;
+       out->val3 = in->val3;
+       out->val4 = in->val4;
+
+       return 0;
+}
+
+SEC("raw_tp/sys_enter")
+int handle_probed(void *ctx)
+{
+       struct test_struct___downsize *in = (void *)&input;
+       __u64 tmp;
+
+       tmp = 0;
+       bpf_core_read(&tmp, bpf_core_field_size(in->ptr), &in->ptr);
+       ptr_probed = tmp;
+
+       tmp = 0;
+       bpf_core_read(&tmp, bpf_core_field_size(in->val1), &in->val1);
+       val1_probed = tmp;
+
+       tmp = 0;
+       bpf_core_read(&tmp, bpf_core_field_size(in->val2), &in->val2);
+       val2_probed = tmp;
+
+       tmp = 0;
+       bpf_core_read(&tmp, bpf_core_field_size(in->val3), &in->val3);
+       val3_probed = tmp;
+
+       tmp = 0;
+       bpf_core_read(&tmp, bpf_core_field_size(in->val4), &in->val4);
+       val4_probed = tmp;
+
+       return 0;
+}
+
+SEC("raw_tp/sys_enter")
+int handle_signed(void *ctx)
+{
+       struct test_struct___signed *in = (void *)&input;
+       struct test_struct___signed *out = (void *)&output_signed;
+
+       val2_signed = in->val2;
+       val3_signed = in->val3;
+       val4_signed = in->val4;
+
+       out->val2 = in->val2;
+       out->val3 = in->val3;
+       out->val4 = in->val4;
+
+       return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/test_ksyms_btf.c b/tools/testing/selftests/bpf/progs/test_ksyms_btf.c
new file mode 100644 (file)
index 0000000..bb8ea92
--- /dev/null
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Google */
+
+#include "vmlinux.h"
+
+#include <bpf/bpf_helpers.h>
+
+__u64 out__runqueues_addr = -1;
+__u64 out__bpf_prog_active_addr = -1;
+
+__u32 out__rq_cpu = -1; /* percpu struct fields */
+int out__bpf_prog_active = -1; /* percpu int */
+
+__u32 out__this_rq_cpu = -1;
+int out__this_bpf_prog_active = -1;
+
+__u32 out__cpu_0_rq_cpu = -1; /* cpu_rq(0)->cpu */
+
+extern const struct rq runqueues __ksym; /* struct type global var. */
+extern const int bpf_prog_active __ksym; /* int type global var. */
+
+SEC("raw_tp/sys_enter")
+int handler(const void *ctx)
+{
+       struct rq *rq;
+       int *active;
+       __u32 cpu;
+
+       out__runqueues_addr = (__u64)&runqueues;
+       out__bpf_prog_active_addr = (__u64)&bpf_prog_active;
+
+       cpu = bpf_get_smp_processor_id();
+
+       /* test bpf_per_cpu_ptr() */
+       rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, cpu);
+       if (rq)
+               out__rq_cpu = rq->cpu;
+       active = (int *)bpf_per_cpu_ptr(&bpf_prog_active, cpu);
+       if (active)
+               out__bpf_prog_active = *active;
+
+       rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, 0);
+       if (rq) /* should always be valid, but we can't spare the check. */
+               out__cpu_0_rq_cpu = rq->cpu;
+
+       /* test bpf_this_cpu_ptr */
+       rq = (struct rq *)bpf_this_cpu_ptr(&runqueues);
+       out__this_rq_cpu = rq->cpu;
+       active = (int *)bpf_this_cpu_ptr(&bpf_prog_active);
+       out__this_bpf_prog_active = *active;
+
+       return 0;
+}
+
+char _license[] SEC("license") = "GPL";
index 3a216d1..72ec017 100644 (file)
@@ -304,10 +304,10 @@ int misc_estab(struct bpf_sock_ops *skops)
                passive_lport_n = __bpf_htons(passive_lport_h);
                bpf_setsockopt(skops, SOL_TCP, TCP_SAVE_SYN,
                               &true_val, sizeof(true_val));
-               set_hdr_cb_flags(skops);
+               set_hdr_cb_flags(skops, 0);
                break;
        case BPF_SOCK_OPS_TCP_CONNECT_CB:
-               set_hdr_cb_flags(skops);
+               set_hdr_cb_flags(skops, 0);
                break;
        case BPF_SOCK_OPS_PARSE_HDR_OPT_CB:
                return handle_parse_hdr(skops);
index 3dca4c2..1858435 100644 (file)
@@ -131,39 +131,55 @@ int bpf_prog2(struct __sk_buff *skb)
 
 }
 
-SEC("sk_skb3")
-int bpf_prog3(struct __sk_buff *skb)
+static inline void bpf_write_pass(struct __sk_buff *skb, int offset)
 {
-       const int one = 1;
-       int err, *f, ret = SK_PASS;
+       int err = bpf_skb_pull_data(skb, 6 + offset);
        void *data_end;
        char *c;
 
-       err = bpf_skb_pull_data(skb, 19);
        if (err)
-               goto tls_out;
+               return;
 
        c = (char *)(long)skb->data;
        data_end = (void *)(long)skb->data_end;
 
-       if (c + 18 < data_end)
-               memcpy(&c[13], "PASS", 4);
+       if (c + 5 + offset < data_end)
+               memcpy(c + offset, "PASS", 4);
+}
+
+SEC("sk_skb3")
+int bpf_prog3(struct __sk_buff *skb)
+{
+       int err, *f, ret = SK_PASS;
+       const int one = 1;
+
        f = bpf_map_lookup_elem(&sock_skb_opts, &one);
        if (f && *f) {
                __u64 flags = 0;
 
                ret = 0;
                flags = *f;
+
+               err = bpf_skb_adjust_room(skb, -13, 0, 0);
+               if (err)
+                       return SK_DROP;
+               err = bpf_skb_adjust_room(skb, 4, 0, 0);
+               if (err)
+                       return SK_DROP;
+               bpf_write_pass(skb, 0);
 #ifdef SOCKMAP
                return bpf_sk_redirect_map(skb, &tls_sock_map, ret, flags);
 #else
                return bpf_sk_redirect_hash(skb, &tls_sock_map, &ret, flags);
 #endif
        }
-
        f = bpf_map_lookup_elem(&sock_skb_opts, &one);
        if (f && *f)
                ret = SK_DROP;
+       err = bpf_skb_adjust_room(skb, 4, 0, 0);
+       if (err)
+               return SK_DROP;
+       bpf_write_pass(skb, 13);
 tls_out:
        return ret;
 }
index 889a72c..fe18261 100644 (file)
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_endian.h>
 
-#ifndef barrier_data
-# define barrier_data(ptr)     asm volatile("": :"r"(ptr) :"memory")
-#endif
-
 #ifndef ctx_ptr
 # define ctx_ptr(field)                (void *)(long)(field)
 #endif
 
-#define dst_to_src_tmp         0xeeddddeeU
-#define src_to_dst_tmp         0xeeffffeeU
-
 #define ip4_src                        0xac100164 /* 172.16.1.100 */
 #define ip4_dst                        0xac100264 /* 172.16.2.100 */
 
                                 a.s6_addr32[3] == b.s6_addr32[3])
 #endif
 
+enum {
+       dev_src,
+       dev_dst,
+};
+
+struct bpf_map_def SEC("maps") ifindex_map = {
+       .type           = BPF_MAP_TYPE_ARRAY,
+       .key_size       = sizeof(int),
+       .value_size     = sizeof(int),
+       .max_entries    = 2,
+};
+
 static __always_inline bool is_remote_ep_v4(struct __sk_buff *skb,
                                            __be32 addr)
 {
@@ -73,7 +78,14 @@ static __always_inline bool is_remote_ep_v6(struct __sk_buff *skb,
        return v6_equal(ip6h->daddr, addr);
 }
 
-SEC("chk_neigh") int tc_chk(struct __sk_buff *skb)
+static __always_inline int get_dev_ifindex(int which)
+{
+       int *ifindex = bpf_map_lookup_elem(&ifindex_map, &which);
+
+       return ifindex ? *ifindex : 0;
+}
+
+SEC("chk_egress") int tc_chk(struct __sk_buff *skb)
 {
        void *data_end = ctx_ptr(skb->data_end);
        void *data = ctx_ptr(skb->data);
@@ -87,7 +99,6 @@ SEC("chk_neigh") int tc_chk(struct __sk_buff *skb)
 
 SEC("dst_ingress") int tc_dst(struct __sk_buff *skb)
 {
-       int idx = dst_to_src_tmp;
        __u8 zero[ETH_ALEN * 2];
        bool redirect = false;
 
@@ -103,19 +114,15 @@ SEC("dst_ingress") int tc_dst(struct __sk_buff *skb)
        if (!redirect)
                return TC_ACT_OK;
 
-       barrier_data(&idx);
-       idx = bpf_ntohl(idx);
-
        __builtin_memset(&zero, 0, sizeof(zero));
        if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0)
                return TC_ACT_SHOT;
 
-       return bpf_redirect_neigh(idx, 0);
+       return bpf_redirect_neigh(get_dev_ifindex(dev_src), 0);
 }
 
 SEC("src_ingress") int tc_src(struct __sk_buff *skb)
 {
-       int idx = src_to_dst_tmp;
        __u8 zero[ETH_ALEN * 2];
        bool redirect = false;
 
@@ -131,14 +138,11 @@ SEC("src_ingress") int tc_src(struct __sk_buff *skb)
        if (!redirect)
                return TC_ACT_OK;
 
-       barrier_data(&idx);
-       idx = bpf_ntohl(idx);
-
        __builtin_memset(&zero, 0, sizeof(zero));
        if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0)
                return TC_ACT_SHOT;
 
-       return bpf_redirect_neigh(idx, 0);
+       return bpf_redirect_neigh(get_dev_ifindex(dev_dst), 0);
 }
 
 char __license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_tc_peer.c b/tools/testing/selftests/bpf/progs/test_tc_peer.c
new file mode 100644 (file)
index 0000000..fc84a76
--- /dev/null
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdint.h>
+#include <stdbool.h>
+
+#include <linux/bpf.h>
+#include <linux/stddef.h>
+#include <linux/pkt_cls.h>
+
+#include <bpf/bpf_helpers.h>
+
+enum {
+       dev_src,
+       dev_dst,
+};
+
+struct bpf_map_def SEC("maps") ifindex_map = {
+       .type           = BPF_MAP_TYPE_ARRAY,
+       .key_size       = sizeof(int),
+       .value_size     = sizeof(int),
+       .max_entries    = 2,
+};
+
+static __always_inline int get_dev_ifindex(int which)
+{
+       int *ifindex = bpf_map_lookup_elem(&ifindex_map, &which);
+
+       return ifindex ? *ifindex : 0;
+}
+
+SEC("chk_egress") int tc_chk(struct __sk_buff *skb)
+{
+       return TC_ACT_SHOT;
+}
+
+SEC("dst_ingress") int tc_dst(struct __sk_buff *skb)
+{
+       return bpf_redirect_peer(get_dev_ifindex(dev_src), 0);
+}
+
+SEC("src_ingress") int tc_src(struct __sk_buff *skb)
+{
+       return bpf_redirect_peer(get_dev_ifindex(dev_dst), 0);
+}
+
+char __license[] SEC("license") = "GPL";
index 9197a23..678bd0f 100644 (file)
@@ -21,6 +21,7 @@
 
 __u8 test_kind = TCPOPT_EXP;
 __u16 test_magic = 0xeB9F;
+__u32 inherit_cb_flags = 0;
 
 struct bpf_test_option passive_synack_out = {};
 struct bpf_test_option passive_fin_out = {};
@@ -467,6 +468,8 @@ static int handle_passive_estab(struct bpf_sock_ops *skops)
        struct tcphdr *th;
        int err;
 
+       inherit_cb_flags = skops->bpf_sock_ops_cb_flags;
+
        err = load_option(skops, &passive_estab_in, true);
        if (err == -ENOENT) {
                /* saved_syn is not found. It was in syncookie mode.
@@ -600,10 +603,10 @@ int estab(struct bpf_sock_ops *skops)
        case BPF_SOCK_OPS_TCP_LISTEN_CB:
                bpf_setsockopt(skops, SOL_TCP, TCP_SAVE_SYN,
                               &true_val, sizeof(true_val));
-               set_hdr_cb_flags(skops);
+               set_hdr_cb_flags(skops, BPF_SOCK_OPS_STATE_CB_FLAG);
                break;
        case BPF_SOCK_OPS_TCP_CONNECT_CB:
-               set_hdr_cb_flags(skops);
+               set_hdr_cb_flags(skops, 0);
                break;
        case BPF_SOCK_OPS_PARSE_HDR_OPT_CB:
                return handle_parse_hdr(skops);
index 9b6fb00..0fa1e42 100644 (file)
@@ -86,6 +86,7 @@ int txmsg_ktls_skb_redir;
 int ktls;
 int peek_flag;
 int skb_use_parser;
+int txmsg_omit_skb_parser;
 
 static const struct option long_options[] = {
        {"help",        no_argument,            NULL, 'h' },
@@ -111,6 +112,7 @@ static const struct option long_options[] = {
        {"txmsg_redir_skb", no_argument,        &txmsg_redir_skb, 1 },
        {"ktls", no_argument,                   &ktls, 1 },
        {"peek", no_argument,                   &peek_flag, 1 },
+       {"txmsg_omit_skb_parser", no_argument,      &txmsg_omit_skb_parser, 1},
        {"whitelist", required_argument,        NULL, 'n' },
        {"blacklist", required_argument,        NULL, 'b' },
        {0, 0, NULL, 0 }
@@ -175,6 +177,7 @@ static void test_reset(void)
        txmsg_apply = txmsg_cork = 0;
        txmsg_ingress = txmsg_redir_skb = 0;
        txmsg_ktls_skb = txmsg_ktls_skb_drop = txmsg_ktls_skb_redir = 0;
+       txmsg_omit_skb_parser = 0;
        skb_use_parser = 0;
 }
 
@@ -518,28 +521,13 @@ static int msg_verify_data(struct msghdr *msg, int size, int chunk_sz)
                if (i == 0 && txmsg_ktls_skb) {
                        if (msg->msg_iov[i].iov_len < 4)
                                return -EIO;
-                       if (txmsg_ktls_skb_redir) {
-                               if (memcmp(&d[13], "PASS", 4) != 0) {
-                                       fprintf(stderr,
-                                               "detected redirect ktls_skb data error with skb ingress update @iov[%i]:%i \"%02x %02x %02x %02x\" != \"PASS\"\n", i, 0, d[13], d[14], d[15], d[16]);
-                                       return -EIO;
-                               }
-                               d[13] = 0;
-                               d[14] = 1;
-                               d[15] = 2;
-                               d[16] = 3;
-                               j = 13;
-                       } else if (txmsg_ktls_skb) {
-                               if (memcmp(d, "PASS", 4) != 0) {
-                                       fprintf(stderr,
-                                               "detected ktls_skb data error with skb ingress update @iov[%i]:%i \"%02x %02x %02x %02x\" != \"PASS\"\n", i, 0, d[0], d[1], d[2], d[3]);
-                                       return -EIO;
-                               }
-                               d[0] = 0;
-                               d[1] = 1;
-                               d[2] = 2;
-                               d[3] = 3;
+                       if (memcmp(d, "PASS", 4) != 0) {
+                               fprintf(stderr,
+                                       "detected skb data error with skb ingress update @iov[%i]:%i \"%02x %02x %02x %02x\" != \"PASS\"\n",
+                                       i, 0, d[0], d[1], d[2], d[3]);
+                               return -EIO;
                        }
+                       j = 4; /* advance index past PASS header */
                }
 
                for (; j < msg->msg_iov[i].iov_len && size; j++) {
@@ -927,13 +915,15 @@ static int run_options(struct sockmap_options *options, int cg_fd,  int test)
                goto run;
 
        /* Attach programs to sockmap */
-       err = bpf_prog_attach(prog_fd[0], map_fd[0],
-                               BPF_SK_SKB_STREAM_PARSER, 0);
-       if (err) {
-               fprintf(stderr,
-                       "ERROR: bpf_prog_attach (sockmap %i->%i): %d (%s)\n",
-                       prog_fd[0], map_fd[0], err, strerror(errno));
-               return err;
+       if (!txmsg_omit_skb_parser) {
+               err = bpf_prog_attach(prog_fd[0], map_fd[0],
+                                     BPF_SK_SKB_STREAM_PARSER, 0);
+               if (err) {
+                       fprintf(stderr,
+                               "ERROR: bpf_prog_attach (sockmap %i->%i): %d (%s)\n",
+                               prog_fd[0], map_fd[0], err, strerror(errno));
+                       return err;
+               }
        }
 
        err = bpf_prog_attach(prog_fd[1], map_fd[0],
@@ -946,13 +936,15 @@ static int run_options(struct sockmap_options *options, int cg_fd,  int test)
 
        /* Attach programs to TLS sockmap */
        if (txmsg_ktls_skb) {
-               err = bpf_prog_attach(prog_fd[0], map_fd[8],
-                                       BPF_SK_SKB_STREAM_PARSER, 0);
-               if (err) {
-                       fprintf(stderr,
-                               "ERROR: bpf_prog_attach (TLS sockmap %i->%i): %d (%s)\n",
-                               prog_fd[0], map_fd[8], err, strerror(errno));
-                       return err;
+               if (!txmsg_omit_skb_parser) {
+                       err = bpf_prog_attach(prog_fd[0], map_fd[8],
+                                             BPF_SK_SKB_STREAM_PARSER, 0);
+                       if (err) {
+                               fprintf(stderr,
+                                       "ERROR: bpf_prog_attach (TLS sockmap %i->%i): %d (%s)\n",
+                                       prog_fd[0], map_fd[8], err, strerror(errno));
+                               return err;
+                       }
                }
 
                err = bpf_prog_attach(prog_fd[2], map_fd[8],
@@ -1480,12 +1472,29 @@ static void test_txmsg_skb(int cgrp, struct sockmap_options *opt)
        txmsg_ktls_skb_drop = 0;
        txmsg_ktls_skb_redir = 1;
        test_exec(cgrp, opt);
+       txmsg_ktls_skb_redir = 0;
+
+       /* Tests that omit skb_parser */
+       txmsg_omit_skb_parser = 1;
+       ktls = 0;
+       txmsg_ktls_skb = 0;
+       test_exec(cgrp, opt);
+
+       txmsg_ktls_skb_drop = 1;
+       test_exec(cgrp, opt);
+       txmsg_ktls_skb_drop = 0;
+
+       txmsg_ktls_skb_redir = 1;
+       test_exec(cgrp, opt);
+
+       ktls = 1;
+       test_exec(cgrp, opt);
+       txmsg_omit_skb_parser = 0;
 
        opt->data_test = data;
        ktls = k;
 }
 
-
 /* Test cork with hung data. This tests poor usage patterns where
  * cork can leave data on the ring if user program is buggy and
  * doesn't flush them somehow. They do take some time however
diff --git a/tools/testing/selftests/bpf/test_tc_neigh.sh b/tools/testing/selftests/bpf/test_tc_neigh.sh
deleted file mode 100755 (executable)
index 31d8c3d..0000000
+++ /dev/null
@@ -1,168 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-#
-# This test sets up 3 netns (src <-> fwd <-> dst). There is no direct veth link
-# between src and dst. The netns fwd has veth links to each src and dst. The
-# client is in src and server in dst. The test installs a TC BPF program to each
-# host facing veth in fwd which calls into bpf_redirect_peer() to perform the
-# neigh addr population and redirect; it also installs a dropper prog on the
-# egress side to drop skbs if neigh addrs were not populated.
-
-if [[ $EUID -ne 0 ]]; then
-       echo "This script must be run as root"
-       echo "FAIL"
-       exit 1
-fi
-
-# check that nc, dd, ping, ping6 and timeout are present
-command -v nc >/dev/null 2>&1 || \
-       { echo >&2 "nc is not available"; exit 1; }
-command -v dd >/dev/null 2>&1 || \
-       { echo >&2 "dd is not available"; exit 1; }
-command -v timeout >/dev/null 2>&1 || \
-       { echo >&2 "timeout is not available"; exit 1; }
-command -v ping >/dev/null 2>&1 || \
-       { echo >&2 "ping is not available"; exit 1; }
-command -v ping6 >/dev/null 2>&1 || \
-       { echo >&2 "ping6 is not available"; exit 1; }
-
-readonly GREEN='\033[0;92m'
-readonly RED='\033[0;31m'
-readonly NC='\033[0m' # No Color
-
-readonly PING_ARG="-c 3 -w 10 -q"
-
-readonly TIMEOUT=10
-
-readonly NS_SRC="ns-src-$(mktemp -u XXXXXX)"
-readonly NS_FWD="ns-fwd-$(mktemp -u XXXXXX)"
-readonly NS_DST="ns-dst-$(mktemp -u XXXXXX)"
-
-readonly IP4_SRC="172.16.1.100"
-readonly IP4_DST="172.16.2.100"
-
-readonly IP6_SRC="::1:dead:beef:cafe"
-readonly IP6_DST="::2:dead:beef:cafe"
-
-readonly IP4_SLL="169.254.0.1"
-readonly IP4_DLL="169.254.0.2"
-readonly IP4_NET="169.254.0.0"
-
-cleanup()
-{
-       ip netns del ${NS_SRC}
-       ip netns del ${NS_FWD}
-       ip netns del ${NS_DST}
-}
-
-trap cleanup EXIT
-
-set -e
-
-ip netns add "${NS_SRC}"
-ip netns add "${NS_FWD}"
-ip netns add "${NS_DST}"
-
-ip link add veth_src type veth peer name veth_src_fwd
-ip link add veth_dst type veth peer name veth_dst_fwd
-
-ip link set veth_src netns ${NS_SRC}
-ip link set veth_src_fwd netns ${NS_FWD}
-
-ip link set veth_dst netns ${NS_DST}
-ip link set veth_dst_fwd netns ${NS_FWD}
-
-ip -netns ${NS_SRC} addr add ${IP4_SRC}/32 dev veth_src
-ip -netns ${NS_DST} addr add ${IP4_DST}/32 dev veth_dst
-
-# The fwd netns automatically get a v6 LL address / routes, but also needs v4
-# one in order to start ARP probing. IP4_NET route is added to the endpoints
-# so that the ARP processing will reply.
-
-ip -netns ${NS_FWD} addr add ${IP4_SLL}/32 dev veth_src_fwd
-ip -netns ${NS_FWD} addr add ${IP4_DLL}/32 dev veth_dst_fwd
-
-ip -netns ${NS_SRC} addr add ${IP6_SRC}/128 dev veth_src nodad
-ip -netns ${NS_DST} addr add ${IP6_DST}/128 dev veth_dst nodad
-
-ip -netns ${NS_SRC} link set dev veth_src up
-ip -netns ${NS_FWD} link set dev veth_src_fwd up
-
-ip -netns ${NS_DST} link set dev veth_dst up
-ip -netns ${NS_FWD} link set dev veth_dst_fwd up
-
-ip -netns ${NS_SRC} route add ${IP4_DST}/32 dev veth_src scope global
-ip -netns ${NS_SRC} route add ${IP4_NET}/16 dev veth_src scope global
-ip -netns ${NS_FWD} route add ${IP4_SRC}/32 dev veth_src_fwd scope global
-
-ip -netns ${NS_SRC} route add ${IP6_DST}/128 dev veth_src scope global
-ip -netns ${NS_FWD} route add ${IP6_SRC}/128 dev veth_src_fwd scope global
-
-ip -netns ${NS_DST} route add ${IP4_SRC}/32 dev veth_dst scope global
-ip -netns ${NS_DST} route add ${IP4_NET}/16 dev veth_dst scope global
-ip -netns ${NS_FWD} route add ${IP4_DST}/32 dev veth_dst_fwd scope global
-
-ip -netns ${NS_DST} route add ${IP6_SRC}/128 dev veth_dst scope global
-ip -netns ${NS_FWD} route add ${IP6_DST}/128 dev veth_dst_fwd scope global
-
-fmac_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/address)
-fmac_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/address)
-
-ip -netns ${NS_SRC} neigh add ${IP4_DST} dev veth_src lladdr $fmac_src
-ip -netns ${NS_DST} neigh add ${IP4_SRC} dev veth_dst lladdr $fmac_dst
-
-ip -netns ${NS_SRC} neigh add ${IP6_DST} dev veth_src lladdr $fmac_src
-ip -netns ${NS_DST} neigh add ${IP6_SRC} dev veth_dst lladdr $fmac_dst
-
-veth_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/ifindex | awk '{printf "%08x\n", $1}')
-veth_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/ifindex | awk '{printf "%08x\n", $1}')
-
-xxd -p < test_tc_neigh.o   | sed "s/eeddddee/$veth_src/g" | xxd -r -p > test_tc_neigh.x.o
-xxd -p < test_tc_neigh.x.o | sed "s/eeffffee/$veth_dst/g" | xxd -r -p > test_tc_neigh.y.o
-
-ip netns exec ${NS_FWD} tc qdisc add dev veth_src_fwd clsact
-ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd ingress bpf da obj test_tc_neigh.y.o sec src_ingress
-ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd egress  bpf da obj test_tc_neigh.y.o sec chk_neigh
-
-ip netns exec ${NS_FWD} tc qdisc add dev veth_dst_fwd clsact
-ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd ingress bpf da obj test_tc_neigh.y.o sec dst_ingress
-ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd egress  bpf da obj test_tc_neigh.y.o sec chk_neigh
-
-rm -f test_tc_neigh.x.o test_tc_neigh.y.o
-
-ip netns exec ${NS_DST} bash -c "nc -4 -l -p 9004 &"
-ip netns exec ${NS_DST} bash -c "nc -6 -l -p 9006 &"
-
-set +e
-
-TEST="TCPv4 connectivity test"
-ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP4_DST}/9004"
-if [ $? -ne 0 ]; then
-       echo -e "${TEST}: ${RED}FAIL${NC}"
-       exit 1
-fi
-echo -e "${TEST}: ${GREEN}PASS${NC}"
-
-TEST="TCPv6 connectivity test"
-ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP6_DST}/9006"
-if [ $? -ne 0 ]; then
-       echo -e "${TEST}: ${RED}FAIL${NC}"
-       exit 1
-fi
-echo -e "${TEST}: ${GREEN}PASS${NC}"
-
-TEST="ICMPv4 connectivity test"
-ip netns exec ${NS_SRC} ping  $PING_ARG ${IP4_DST}
-if [ $? -ne 0 ]; then
-       echo -e "${TEST}: ${RED}FAIL${NC}"
-       exit 1
-fi
-echo -e "${TEST}: ${GREEN}PASS${NC}"
-
-TEST="ICMPv6 connectivity test"
-ip netns exec ${NS_SRC} ping6 $PING_ARG ${IP6_DST}
-if [ $? -ne 0 ]; then
-       echo -e "${TEST}: ${RED}FAIL${NC}"
-       exit 1
-fi
-echo -e "${TEST}: ${GREEN}PASS${NC}"
diff --git a/tools/testing/selftests/bpf/test_tc_redirect.sh b/tools/testing/selftests/bpf/test_tc_redirect.sh
new file mode 100755 (executable)
index 0000000..6d74825
--- /dev/null
@@ -0,0 +1,204 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# This test sets up 3 netns (src <-> fwd <-> dst). There is no direct veth link
+# between src and dst. The netns fwd has veth links to both src and dst. The
+# client is in src and the server in dst. The test installs a TC BPF program on
+# each host-facing veth in fwd which calls into i) bpf_redirect_neigh() to
+# perform the neigh addr population and redirect, or ii) bpf_redirect_peer()
+# to switch namespaces from the ingress side; it also installs a checker prog
+# on the egress side to drop unexpected traffic.
+
+if [[ $EUID -ne 0 ]]; then
+       echo "This script must be run as root"
+       echo "FAIL"
+       exit 1
+fi
+
+# check that needed tools are present
+command -v nc >/dev/null 2>&1 || \
+       { echo >&2 "nc is not available"; exit 1; }
+command -v dd >/dev/null 2>&1 || \
+       { echo >&2 "dd is not available"; exit 1; }
+command -v timeout >/dev/null 2>&1 || \
+       { echo >&2 "timeout is not available"; exit 1; }
+command -v ping >/dev/null 2>&1 || \
+       { echo >&2 "ping is not available"; exit 1; }
+command -v ping6 >/dev/null 2>&1 || \
+       { echo >&2 "ping6 is not available"; exit 1; }
+command -v perl >/dev/null 2>&1 || \
+       { echo >&2 "perl is not available"; exit 1; }
+command -v jq >/dev/null 2>&1 || \
+       { echo >&2 "jq is not available"; exit 1; }
+command -v bpftool >/dev/null 2>&1 || \
+       { echo >&2 "bpftool is not available"; exit 1; }
+
+readonly GREEN='\033[0;92m'
+readonly RED='\033[0;31m'
+readonly NC='\033[0m' # No Color
+
+readonly PING_ARG="-c 3 -w 10 -q"
+
+readonly TIMEOUT=10
+
+readonly NS_SRC="ns-src-$(mktemp -u XXXXXX)"
+readonly NS_FWD="ns-fwd-$(mktemp -u XXXXXX)"
+readonly NS_DST="ns-dst-$(mktemp -u XXXXXX)"
+
+readonly IP4_SRC="172.16.1.100"
+readonly IP4_DST="172.16.2.100"
+
+readonly IP6_SRC="::1:dead:beef:cafe"
+readonly IP6_DST="::2:dead:beef:cafe"
+
+readonly IP4_SLL="169.254.0.1"
+readonly IP4_DLL="169.254.0.2"
+readonly IP4_NET="169.254.0.0"
+
+netns_cleanup()
+{
+       ip netns del ${NS_SRC}
+       ip netns del ${NS_FWD}
+       ip netns del ${NS_DST}
+}
+
+netns_setup()
+{
+       ip netns add "${NS_SRC}"
+       ip netns add "${NS_FWD}"
+       ip netns add "${NS_DST}"
+
+       ip link add veth_src type veth peer name veth_src_fwd
+       ip link add veth_dst type veth peer name veth_dst_fwd
+
+       ip link set veth_src netns ${NS_SRC}
+       ip link set veth_src_fwd netns ${NS_FWD}
+
+       ip link set veth_dst netns ${NS_DST}
+       ip link set veth_dst_fwd netns ${NS_FWD}
+
+       ip -netns ${NS_SRC} addr add ${IP4_SRC}/32 dev veth_src
+       ip -netns ${NS_DST} addr add ${IP4_DST}/32 dev veth_dst
+
+       # The fwd netns automatically gets a v6 LL address / routes, but also
+       # needs a v4 one in order to start ARP probing. The IP4_NET route is
+       # added to the endpoints so that ARP probes get answered.
+
+       ip -netns ${NS_FWD} addr add ${IP4_SLL}/32 dev veth_src_fwd
+       ip -netns ${NS_FWD} addr add ${IP4_DLL}/32 dev veth_dst_fwd
+
+       ip -netns ${NS_SRC} addr add ${IP6_SRC}/128 dev veth_src nodad
+       ip -netns ${NS_DST} addr add ${IP6_DST}/128 dev veth_dst nodad
+
+       ip -netns ${NS_SRC} link set dev veth_src up
+       ip -netns ${NS_FWD} link set dev veth_src_fwd up
+
+       ip -netns ${NS_DST} link set dev veth_dst up
+       ip -netns ${NS_FWD} link set dev veth_dst_fwd up
+
+       ip -netns ${NS_SRC} route add ${IP4_DST}/32 dev veth_src scope global
+       ip -netns ${NS_SRC} route add ${IP4_NET}/16 dev veth_src scope global
+       ip -netns ${NS_FWD} route add ${IP4_SRC}/32 dev veth_src_fwd scope global
+
+       ip -netns ${NS_SRC} route add ${IP6_DST}/128 dev veth_src scope global
+       ip -netns ${NS_FWD} route add ${IP6_SRC}/128 dev veth_src_fwd scope global
+
+       ip -netns ${NS_DST} route add ${IP4_SRC}/32 dev veth_dst scope global
+       ip -netns ${NS_DST} route add ${IP4_NET}/16 dev veth_dst scope global
+       ip -netns ${NS_FWD} route add ${IP4_DST}/32 dev veth_dst_fwd scope global
+
+       ip -netns ${NS_DST} route add ${IP6_SRC}/128 dev veth_dst scope global
+       ip -netns ${NS_FWD} route add ${IP6_DST}/128 dev veth_dst_fwd scope global
+
+       fmac_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/address)
+       fmac_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/address)
+
+       ip -netns ${NS_SRC} neigh add ${IP4_DST} dev veth_src lladdr $fmac_src
+       ip -netns ${NS_DST} neigh add ${IP4_SRC} dev veth_dst lladdr $fmac_dst
+
+       ip -netns ${NS_SRC} neigh add ${IP6_DST} dev veth_src lladdr $fmac_src
+       ip -netns ${NS_DST} neigh add ${IP6_SRC} dev veth_dst lladdr $fmac_dst
+}
+
+netns_test_connectivity()
+{
+       set +e
+
+       ip netns exec ${NS_DST} bash -c "nc -4 -l -p 9004 &"
+       ip netns exec ${NS_DST} bash -c "nc -6 -l -p 9006 &"
+
+       TEST="TCPv4 connectivity test"
+       ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP4_DST}/9004"
+       if [ $? -ne 0 ]; then
+               echo -e "${TEST}: ${RED}FAIL${NC}"
+               exit 1
+       fi
+       echo -e "${TEST}: ${GREEN}PASS${NC}"
+
+       TEST="TCPv6 connectivity test"
+       ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP6_DST}/9006"
+       if [ $? -ne 0 ]; then
+               echo -e "${TEST}: ${RED}FAIL${NC}"
+               exit 1
+       fi
+       echo -e "${TEST}: ${GREEN}PASS${NC}"
+
+       TEST="ICMPv4 connectivity test"
+       ip netns exec ${NS_SRC} ping  $PING_ARG ${IP4_DST}
+       if [ $? -ne 0 ]; then
+               echo -e "${TEST}: ${RED}FAIL${NC}"
+               exit 1
+       fi
+       echo -e "${TEST}: ${GREEN}PASS${NC}"
+
+       TEST="ICMPv6 connectivity test"
+       ip netns exec ${NS_SRC} ping6 $PING_ARG ${IP6_DST}
+       if [ $? -ne 0 ]; then
+               echo -e "${TEST}: ${RED}FAIL${NC}"
+               exit 1
+       fi
+       echo -e "${TEST}: ${GREEN}PASS${NC}"
+
+       set -e
+}
+
+hex_mem_str()
+{
+       perl -e 'print join(" ", unpack("(H2)8", pack("L", @ARGV)))' $1
+}
+
+netns_setup_bpf()
+{
+       local obj=$1
+
+       ip netns exec ${NS_FWD} tc qdisc add dev veth_src_fwd clsact
+       ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd ingress bpf da obj $obj sec src_ingress
+       ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd egress  bpf da obj $obj sec chk_egress
+
+       ip netns exec ${NS_FWD} tc qdisc add dev veth_dst_fwd clsact
+       ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd ingress bpf da obj $obj sec dst_ingress
+       ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd egress  bpf da obj $obj sec chk_egress
+
+       veth_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/ifindex)
+       veth_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/ifindex)
+
+       progs=$(ip netns exec ${NS_FWD} bpftool net --json | jq -r '.[] | .tc | map(.id) | .[]')
+       for prog in $progs; do
+               map=$(bpftool prog show id $prog --json | jq -r '.map_ids | .? | .[]')
+               if [ ! -z "$map" ]; then
+                       bpftool map update id $map key hex $(hex_mem_str 0) value hex $(hex_mem_str $veth_src)
+                       bpftool map update id $map key hex $(hex_mem_str 1) value hex $(hex_mem_str $veth_dst)
+               fi
+       done
+}
+
+trap netns_cleanup EXIT
+set -e
+
+netns_setup
+netns_setup_bpf test_tc_neigh.o
+netns_test_connectivity
+netns_cleanup
+netns_setup
+netns_setup_bpf test_tc_peer.o
+netns_test_connectivity
index 78a8cf9..6118e3a 100644 (file)
@@ -110,12 +110,13 @@ static inline void clear_hdr_cb_flags(struct bpf_sock_ops *skops)
                                    BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG));
 }
 
-static inline void set_hdr_cb_flags(struct bpf_sock_ops *skops)
+static inline void set_hdr_cb_flags(struct bpf_sock_ops *skops, __u32 extra)
 {
        bpf_sock_ops_cb_flags_set(skops,
                                  skops->bpf_sock_ops_cb_flags |
                                  BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG |
-                                 BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG);
+                                 BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG |
+                                 extra);
 }
 static inline void
 clear_parse_all_hdr_cb_flags(struct bpf_sock_ops *skops)
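The new extra argument lets a caller request additional sock_ops callbacks on
top of the header-option ones. A condensed sketch of how the updated
test_tcp_hdr_options.c uses it, pieced together from the hunks above:

        /* estab(), listener side: also ask for TCP state callbacks */
        case BPF_SOCK_OPS_TCP_LISTEN_CB:
                set_hdr_cb_flags(skops, BPF_SOCK_OPS_STATE_CB_FLAG);
                break;

        /* handle_passive_estab(): record what the child socket inherited */
        inherit_cb_flags = skops->bpf_sock_ops_cb_flags;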
index 4d0e913..1bbd1d9 100644 (file)
@@ -90,6 +90,33 @@ long ksym_get_addr(const char *name)
        return 0;
 }
 
+/* open kallsyms and read symbol addresses on the fly. Without caching all symbols,
+ * this is faster than load + find.
+ */
+int kallsyms_find(const char *sym, unsigned long long *addr)
+{
+       char type, name[500];
+       unsigned long long value;
+       int err = 0;
+       FILE *f;
+
+       f = fopen("/proc/kallsyms", "r");
+       if (!f)
+               return -EINVAL;
+
+       while (fscanf(f, "%llx %c %499s%*[^\n]\n", &value, &type, name) > 0) {
+               if (strcmp(name, sym) == 0) {
+                       *addr = value;
+                       goto out;
+               }
+       }
+       err = -ENOENT;
+
+out:
+       fclose(f);
+       return err;
+}
+
 void read_trace_pipe(void)
 {
        int trace_fd;
index 25ef597..f62fdef 100644 (file)
@@ -12,6 +12,10 @@ struct ksym {
 int load_kallsyms(void);
 struct ksym *ksym_search(long key);
 long ksym_get_addr(const char *name);
+
+/* open kallsyms and find addresses on the fly, faster than load + search. */
+int kallsyms_find(const char *sym, unsigned long long *addr);
+
 void read_trace_pipe(void);
 
 #endif
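A short usage sketch for the new helper (the symbol name is just an example):
resolve one kernel symbol without loading the whole symbol table first.

        unsigned long long addr = 0;

        /* 0 on success, -EINVAL if /proc/kallsyms cannot be opened,
         * -ENOENT if the symbol is not present
         */
        if (kallsyms_find("bpf_prog_active", &addr) == 0)
                printf("bpf_prog_active is at 0x%llx\n", addr);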
index b8d1864..de84f0d 100644 (file)
@@ -2,7 +2,7 @@
        "empty prog",
        .insns = {
        },
-       .errstr = "unknown opcode 00",
+       .errstr = "last insn is not an exit or jmp",
        .result = REJECT,
 },
 {
index 2c5fbe7..ae72536 100644 (file)
        },
        .prog_type = BPF_PROG_TYPE_SCHED_CLS,
        .result = REJECT,
-       .errstr = "invalid access to packet, off=0 size=8, R5(id=1,off=0,r=0)",
+       .errstr = "invalid access to packet, off=0 size=8, R5(id=2,off=0,r=0)",
        .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 },
 {
index 3856dba..f929790 100644 (file)
        .errstr = "invalid bpf_ld_imm64 insn",
        .result = REJECT,
 },
-{
-       "test5 ld_imm64",
-       .insns = {
-       BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, 0, 0, 0, 0),
-       },
-       .errstr = "invalid bpf_ld_imm64 insn",
-       .result = REJECT,
-},
 {
        "test6 ld_imm64",
        .insns = {
diff --git a/tools/testing/selftests/bpf/verifier/regalloc.c b/tools/testing/selftests/bpf/verifier/regalloc.c
new file mode 100644 (file)
index 0000000..ac71b82
--- /dev/null
@@ -0,0 +1,243 @@
+{
+       "regalloc basic",
+       .insns = {
+       BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+       BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+       BPF_LD_MAP_FD(BPF_REG_1, 0),
+       BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+       BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8),
+       BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+       BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+       BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 20, 4),
+       BPF_JMP_IMM(BPF_JSLT, BPF_REG_2, 0, 3),
+       BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0),
+       BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_2),
+       BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0),
+       BPF_EXIT_INSN(),
+       },
+       .fixup_map_hash_48b = { 4 },
+       .result = ACCEPT,
+       .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
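+/* The tests in this file share one shape: r2 is a plain copy of r0 (the
+ * bpf_get_prandom_u32() result), one bound is established by a check on r0 and
+ * the other by a check on r2 (or on a spilled/filled copy of it), and the
+ * verifier must carry the learned range across the register assignment so the
+ * final pointer arithmetic on r7 stays inside the 48-byte map value. The
+ * "negative" variants push a bound one step too far and must be rejected.
+ */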
+{
+       "regalloc negative",
+       .insns = {
+       BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+       BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+       BPF_LD_MAP_FD(BPF_REG_1, 0),
+       BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+       BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8),
+       BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+       BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+       BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 24, 4),
+       BPF_JMP_IMM(BPF_JSLT, BPF_REG_2, 0, 3),
+       BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0),
+       BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_2),
+       BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_7, 0),
+       BPF_EXIT_INSN(),
+       },
+       .fixup_map_hash_48b = { 4 },
+       .result = REJECT,
+       .errstr = "invalid access to map value, value_size=48 off=48 size=1",
+       .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+       "regalloc src_reg mark",
+       .insns = {
+       BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+       BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+       BPF_LD_MAP_FD(BPF_REG_1, 0),
+       BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+       BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9),
+       BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+       BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+       BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 20, 5),
+       BPF_MOV64_IMM(BPF_REG_3, 0),
+       BPF_JMP_REG(BPF_JSGE, BPF_REG_3, BPF_REG_2, 3),
+       BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0),
+       BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_2),
+       BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0),
+       BPF_EXIT_INSN(),
+       },
+       .fixup_map_hash_48b = { 4 },
+       .result = ACCEPT,
+       .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+       "regalloc src_reg negative",
+       .insns = {
+       BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+       BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+       BPF_LD_MAP_FD(BPF_REG_1, 0),
+       BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+       BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9),
+       BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+       BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+       BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 22, 5),
+       BPF_MOV64_IMM(BPF_REG_3, 0),
+       BPF_JMP_REG(BPF_JSGE, BPF_REG_3, BPF_REG_2, 3),
+       BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0),
+       BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_2),
+       BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0),
+       BPF_EXIT_INSN(),
+       },
+       .fixup_map_hash_48b = { 4 },
+       .result = REJECT,
+       .errstr = "invalid access to map value, value_size=48 off=44 size=8",
+       .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+       "regalloc and spill",
+       .insns = {
+       BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+       BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+       BPF_LD_MAP_FD(BPF_REG_1, 0),
+       BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+       BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 11),
+       BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+       BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+       BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 20, 7),
+       /* r0 has upper bound that should propagate into r2 */
+       BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -8), /* spill r2 */
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_MOV64_IMM(BPF_REG_2, 0), /* clear r0 and r2 */
+       BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_10, -8), /* fill r3 */
+       BPF_JMP_REG(BPF_JSGE, BPF_REG_0, BPF_REG_3, 2),
+       /* r3 has lower and upper bounds */
+       BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_3),
+       BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0),
+       BPF_EXIT_INSN(),
+       },
+       .fixup_map_hash_48b = { 4 },
+       .result = ACCEPT,
+       .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+       "regalloc and spill negative",
+       .insns = {
+       BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+       BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+       BPF_LD_MAP_FD(BPF_REG_1, 0),
+       BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+       BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 11),
+       BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+       BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+       BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 48, 7),
+       /* r0 has upper bound that should propagate into r2 */
+       BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -8), /* spill r2 */
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_MOV64_IMM(BPF_REG_2, 0), /* clear r0 and r2 */
+       BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_10, -8), /* fill r3 */
+       BPF_JMP_REG(BPF_JSGE, BPF_REG_0, BPF_REG_3, 2),
+       /* r3 has lower and upper bounds */
+       BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_3),
+       BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0),
+       BPF_EXIT_INSN(),
+       },
+       .fixup_map_hash_48b = { 4 },
+       .result = REJECT,
+       .errstr = "invalid access to map value, value_size=48 off=48 size=8",
+       .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+       "regalloc three regs",
+       .insns = {
+       BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+       BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+       BPF_LD_MAP_FD(BPF_REG_1, 0),
+       BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+       BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 10),
+       BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+       BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+       BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
+       BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 12, 5),
+       BPF_JMP_IMM(BPF_JSLT, BPF_REG_2, 0, 4),
+       BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0),
+       BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_2),
+       BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_4),
+       BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0),
+       BPF_EXIT_INSN(),
+       },
+       .fixup_map_hash_48b = { 4 },
+       .result = ACCEPT,
+       .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+       "regalloc after call",
+       .insns = {
+       BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+       BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+       BPF_LD_MAP_FD(BPF_REG_1, 0),
+       BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+       BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 10),
+       BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+       BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+       BPF_MOV64_REG(BPF_REG_8, BPF_REG_0),
+       BPF_MOV64_REG(BPF_REG_9, BPF_REG_0),
+       BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 6),
+       BPF_JMP_IMM(BPF_JSGT, BPF_REG_8, 20, 4),
+       BPF_JMP_IMM(BPF_JSLT, BPF_REG_9, 0, 3),
+       BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_8),
+       BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_9),
+       BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0),
+       BPF_EXIT_INSN(),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_EXIT_INSN(),
+       },
+       .fixup_map_hash_48b = { 4 },
+       .result = ACCEPT,
+       .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+       "regalloc in callee",
+       .insns = {
+       BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+       BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+       BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+       BPF_LD_MAP_FD(BPF_REG_1, 0),
+       BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+       BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6),
+       BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+       BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+       BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+       BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+       BPF_MOV64_REG(BPF_REG_3, BPF_REG_7),
+       BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+       BPF_EXIT_INSN(),
+       BPF_JMP_IMM(BPF_JSGT, BPF_REG_1, 20, 5),
+       BPF_JMP_IMM(BPF_JSLT, BPF_REG_2, 0, 4),
+       BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_1),
+       BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_2),
+       BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0),
+       BPF_EXIT_INSN(),
+       BPF_MOV64_IMM(BPF_REG_0, 0),
+       BPF_EXIT_INSN(),
+       },
+       .fixup_map_hash_48b = { 4 },
+       .result = ACCEPT,
+       .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},