/* XDP redirect to CPUs via cpumap (BPF_MAP_TYPE_CPUMAP)
 *
 * GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
 */
#include <uapi/linux/if_ether.h>
#include <uapi/linux/if_packet.h>
#include <uapi/linux/if_vlan.h>
#include <uapi/linux/ip.h>
#include <uapi/linux/ipv6.h>
#include <uapi/linux/in.h>
#include <uapi/linux/tcp.h>
#include <uapi/linux/udp.h>

#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"

#define MAX_CPUS 12 /* WARNING - sync with _user.c */

/* Special map type that can XDP_REDIRECT frames to another CPU */
struct bpf_map_def SEC("maps") cpu_map = {
	.type		= BPF_MAP_TYPE_CPUMAP,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(u32),
	.max_entries	= MAX_CPUS,
};
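
/* A cpumap entry is created from userspace.  A minimal sketch, assuming a
 * libbpf-style setup where cpu_map_fd is the fd of the map above (the
 * variable names and the qsize value are hypothetical; see the matching
 * _user.c for the real setup):
 *
 *	u32 cpu = 2;           // key: CPU to enable as redirect target
 *	u32 queue_size = 128;  // value: size of the per-CPU frame queue
 *	bpf_map_update_elem(cpu_map_fd, &cpu, &queue_size, 0);
 *
 * Inserting an entry allocates a queue plus a kthread bound to that CPU,
 * which injects the redirected frames into the network stack.
 */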

/* Common stats data record to keep userspace more simple */
struct datarec {
	__u64 processed;
	__u64 dropped;
	__u64 issue;
};

/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success
 * feedback.  Redirect TX errors can be caught via a tracepoint.
 */
struct bpf_map_def SEC("maps") rx_cnt = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(struct datarec),
	.max_entries	= 1,
};
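
/* As this is a PERCPU_ARRAY, a userspace lookup returns one struct datarec
 * per possible CPU.  A minimal summing sketch, assuming libbpf and
 * hypothetical rx_cnt_fd/nr_cpus obtained elsewhere:
 *
 *	struct datarec values[nr_cpus];
 *	__u64 sum = 0;
 *	__u32 key = 0;
 *	int i;
 *
 *	bpf_map_lookup_elem(rx_cnt_fd, &key, values);
 *	for (i = 0; i < nr_cpus; i++)
 *		sum += values[i].processed;
 */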

/* Used by trace point */
struct bpf_map_def SEC("maps") redirect_err_cnt = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(struct datarec),
	.max_entries	= 2,
	/* TODO: have entries for all possible errno's */
};

/* Used by trace point */
struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(struct datarec),
	.max_entries	= MAX_CPUS,
};

/* Used by trace point */
struct bpf_map_def SEC("maps") cpumap_kthread_cnt = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(struct datarec),
	.max_entries	= 1,
};

/* Set of maps controlling available CPU, and for iterating through
 * selectable redirect CPUs.
 */
struct bpf_map_def SEC("maps") cpus_available = {
	.type		= BPF_MAP_TYPE_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(u32),
	.max_entries	= MAX_CPUS,
};
struct bpf_map_def SEC("maps") cpus_count = {
	.type		= BPF_MAP_TYPE_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(u32),
	.max_entries	= 1,
};
struct bpf_map_def SEC("maps") cpus_iterator = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(u32),
	.max_entries	= 1,
};
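
/* How userspace might fill these maps; a sketch with hypothetical fds and
 * values (the chosen CPUs must also have been added to cpu_map above):
 *
 *	u32 avail_cpus[] = { 2, 4 };  // redirect targets, in pick order
 *	u32 count = 2, key0 = 0, i;
 *
 *	for (i = 0; i < count; i++)
 *		bpf_map_update_elem(cpus_available_fd, &i, &avail_cpus[i], 0);
 *	bpf_map_update_elem(cpus_count_fd, &key0, &count, 0);
 *
 * cpus_iterator needs no setup; PERCPU_ARRAY entries start out zeroed.
 */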

/* Used by trace point */
struct bpf_map_def SEC("maps") exception_cnt = {
	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(struct datarec),
	.max_entries	= 1,
};

/* Helper parse functions */

/* Parse Ethernet layer 2, extract network layer 3 offset and protocol
 *
 * Returns false on error and non-supported ether-type
 */
struct vlan_hdr {
	__be16 h_vlan_TCI;
	__be16 h_vlan_encapsulated_proto;
};

static __always_inline
bool parse_eth(struct ethhdr *eth, void *data_end,
	       u16 *eth_proto, u64 *l3_offset)
{
	u16 eth_type;
	u64 offset;

	offset = sizeof(*eth);
	if ((void *)eth + offset > data_end)
		return false;

	eth_type = eth->h_proto;

	/* Skip non 802.3 Ethertypes */
	if (unlikely(ntohs(eth_type) < ETH_P_802_3_MIN))
		return false;

	/* Handle VLAN tagged packet */
	if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) {
		struct vlan_hdr *vlan_hdr;

		vlan_hdr = (void *)eth + offset;
		offset += sizeof(*vlan_hdr);
		if ((void *)eth + offset > data_end)
			return false;
		eth_type = vlan_hdr->h_vlan_encapsulated_proto;
	}
	/* TODO: Handle double VLAN tagged packet */

	*eth_proto = ntohs(eth_type);
	*l3_offset = offset;
	return true;
}

static __always_inline
u16 get_dest_port_ipv4_udp(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct iphdr *iph = data + nh_off;
	struct udphdr *udph;
	u16 dport;

	if (iph + 1 > data_end)
		return 0;
	if (!(iph->protocol == IPPROTO_UDP))
		return 0;

	udph = (void *)(iph + 1);
	if (udph + 1 > data_end)
		return 0;

	dport = ntohs(udph->dest);
	return dport;
}

static __always_inline
int get_proto_ipv4(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct iphdr *iph = data + nh_off;

	if (iph + 1 > data_end)
		return 0;
	return iph->protocol;
}

static __always_inline
int get_proto_ipv6(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ipv6hdr *ip6h = data + nh_off;

	if (ip6h + 1 > data_end)
		return 0;
	return ip6h->nexthdr;
}

SEC("xdp_cpu_map0")
int xdp_prognum0_no_touch(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct datarec *rec;
	u32 *cpu_selected;
	u32 cpu_dest;
	u32 key = 0;

	/* Only use first entry in cpus_available */
	cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
	if (!cpu_selected)
		return XDP_ABORTED;
	cpu_dest = *cpu_selected;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}

SEC("xdp_cpu_map1_touch_data")
int xdp_prognum1_touch_data(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	struct datarec *rec;
	u32 *cpu_selected;
	u32 cpu_dest;
	u16 eth_type;
	u32 key = 0;

	/* Only use first entry in cpus_available */
	cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
	if (!cpu_selected)
		return XDP_ABORTED;
	cpu_dest = *cpu_selected;

	/* Validate packet length is minimum Eth header size */
	if (eth + 1 > data_end)
		return XDP_ABORTED;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	/* Read packet data, and use it (drop non 802.3 Ethertypes) */
	eth_type = eth->h_proto;
	if (ntohs(eth_type) < ETH_P_802_3_MIN) {
		rec->dropped++;
		return XDP_DROP;
	}

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}

SEC("xdp_cpu_map2_round_robin")
int xdp_prognum2_round_robin(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	struct datarec *rec;
	u32 cpu_dest;
	u32 key0 = 0;

	u32 *cpu_selected;
	u32 *cpu_iterator;
	u32 *cpu_max;
	u32 cpu_idx;

	cpu_max = bpf_map_lookup_elem(&cpus_count, &key0);
	if (!cpu_max)
		return XDP_ABORTED;

	cpu_iterator = bpf_map_lookup_elem(&cpus_iterator, &key0);
	if (!cpu_iterator)
		return XDP_ABORTED;
	cpu_idx = *cpu_iterator;

	/* Advance the per-CPU iterator, wrapping at cpus_count */
	*cpu_iterator += 1;
	if (*cpu_iterator == *cpu_max)
		*cpu_iterator = 0;

	cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_selected)
		return XDP_ABORTED;
	cpu_dest = *cpu_selected;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key0);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}

SEC("xdp_cpu_map3_proto_separate")
int xdp_prognum3_proto_separate(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	u8 ip_proto = IPPROTO_UDP;
	struct datarec *rec;
	u16 eth_proto = 0;
	u64 l3_offset = 0;
	u32 cpu_dest = 0;
	u32 cpu_idx = 0;
	u32 *cpu_lookup;
	u32 key = 0;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
		return XDP_PASS; /* Just skip */

	/* Extract L4 protocol */
	switch (eth_proto) {
	case ETH_P_IP:
		ip_proto = get_proto_ipv4(ctx, l3_offset);
		break;
	case ETH_P_IPV6:
		ip_proto = get_proto_ipv6(ctx, l3_offset);
		break;
	case ETH_P_ARP:
		cpu_idx = 0; /* ARP packet handled on separate CPU */
		break;
	default:
		cpu_idx = 0;
	}

	/* Choose CPU based on L4 protocol */
	switch (ip_proto) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		cpu_idx = 2;
		break;
	case IPPROTO_TCP:
		cpu_idx = 0;
		break;
	case IPPROTO_UDP:
		cpu_idx = 1;
		break;
	default:
		cpu_idx = 0;
	}

	cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_lookup)
		return XDP_ABORTED;
	cpu_dest = *cpu_lookup;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}

SEC("xdp_cpu_map4_ddos_filter_pktgen")
int xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	u8 ip_proto = IPPROTO_UDP;
	struct datarec *rec;
	u16 eth_proto = 0;
	u64 l3_offset = 0;
	u32 cpu_dest = 0;
	u32 cpu_idx = 0;
	u16 dest_port;
	u32 *cpu_lookup;
	u32 key = 0;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
		return XDP_PASS; /* Just skip */

	/* Extract L4 protocol */
	switch (eth_proto) {
	case ETH_P_IP:
		ip_proto = get_proto_ipv4(ctx, l3_offset);
		break;
	case ETH_P_IPV6:
		ip_proto = get_proto_ipv6(ctx, l3_offset);
		break;
	case ETH_P_ARP:
		cpu_idx = 0; /* ARP packet handled on separate CPU */
		break;
	default:
		cpu_idx = 0;
	}

	/* Choose CPU based on L4 protocol */
	switch (ip_proto) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		cpu_idx = 2;
		break;
	case IPPROTO_TCP:
		cpu_idx = 0;
		break;
	case IPPROTO_UDP:
		cpu_idx = 1;
		/* DDoS filter UDP port 9 (pktgen) */
		dest_port = get_dest_port_ipv4_udp(ctx, l3_offset);
		if (dest_port == 9) {
			rec->dropped++;
			return XDP_DROP;
		}
		break;
	default:
		cpu_idx = 0;
	}

	cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_lookup)
		return XDP_ABORTED;
	cpu_dest = *cpu_lookup;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}

	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}

char _license[] SEC("license") = "GPL";

/*** Trace point code ***/

/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format
 * Code in:           kernel/include/trace/events/xdp.h
 */
struct xdp_redirect_ctx {
	u64 __pad;	// First 8 bytes are not accessible by bpf code
	int prog_id;	//	offset:8;  size:4; signed:1;
	u32 act;	//	offset:12; size:4; signed:0;
	int ifindex;	//	offset:16; size:4; signed:1;
	int err;	//	offset:20; size:4; signed:1;
	int to_ifindex;	//	offset:24; size:4; signed:1;
	u32 map_id;	//	offset:28; size:4; signed:0;
	int map_index;	//	offset:32; size:4; signed:1;
};

enum {
	XDP_REDIRECT_SUCCESS = 0,
	XDP_REDIRECT_ERROR = 1
};

static __always_inline
int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx)
{
	u32 key = XDP_REDIRECT_ERROR;
	struct datarec *rec;
	int err = ctx->err;

	if (!err)
		key = XDP_REDIRECT_SUCCESS;

	rec = bpf_map_lookup_elem(&redirect_err_cnt, &key);
	if (!rec)
		return 0;
	rec->dropped += 1;

	return 0; /* Indicate event was filtered (no further processing) */
	/*
	 * Returning 1 here would allow e.g. a perf-record tracepoint
	 * to see and record these events, but it doesn't work well in
	 * practice, as stopping perf-record also unloads this bpf_prog.
	 * Plus, there is additional overhead of doing so.
	 */
}

SEC("tracepoint/xdp/xdp_redirect_err")
int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx)
{
	return xdp_redirect_collect_stat(ctx);
}

SEC("tracepoint/xdp/xdp_redirect_map_err")
int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx)
{
	return xdp_redirect_collect_stat(ctx);
}

/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format
 * Code in:           kernel/include/trace/events/xdp.h
 */
struct xdp_exception_ctx {
	u64 __pad;	// First 8 bytes are not accessible by bpf code
	int prog_id;	//	offset:8;  size:4; signed:1;
	u32 act;	//	offset:12; size:4; signed:0;
	int ifindex;	//	offset:16; size:4; signed:1;
};

SEC("tracepoint/xdp/xdp_exception")
int trace_xdp_exception(struct xdp_exception_ctx *ctx)
{
	struct datarec *rec;
	u32 key = 0;

	rec = bpf_map_lookup_elem(&exception_cnt, &key);
	if (!rec)
		return 1;
	rec->dropped += 1;

	return 0;
}

/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format
 * Code in:     kernel/include/trace/events/xdp.h
 */
struct cpumap_enqueue_ctx {
	u64 __pad;		// First 8 bytes are not accessible by bpf code
	int map_id;		//	offset:8;  size:4; signed:1;
	u32 act;		//	offset:12; size:4; signed:0;
	int cpu;		//	offset:16; size:4; signed:1;
	unsigned int drops;	//	offset:20; size:4; signed:0;
	unsigned int processed;	//	offset:24; size:4; signed:0;
	int to_cpu;		//	offset:28; size:4; signed:1;
};

SEC("tracepoint/xdp/xdp_cpumap_enqueue")
int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx)
{
	u32 to_cpu = ctx->to_cpu;
	struct datarec *rec;

	if (to_cpu >= MAX_CPUS)
		return 1;

	rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu);
	if (!rec)
		return 0;
	rec->processed += ctx->processed;
	rec->dropped   += ctx->drops;

	/* Record bulk events, then userspace can calc average bulk size */
	if (ctx->processed > 0)
		rec->issue += 1;

	/* Inception: It's possible to detect overload situations via
	 * this tracepoint.  This can be used for creating a feedback
	 * loop to XDP, which can take appropriate actions to mitigate
	 * this overload situation.
	 */
	return 0;
}
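
/* Userspace side of the average-bulk calculation mentioned above, as a
 * sketch (hypothetical "rec" holding the summed per-CPU counters):
 *
 *	double avg_bulk = rec.issue ? (double)rec.processed / rec.issue : 0;
 *
 * "issue" counts non-empty enqueue events, "processed" the frames they
 * carried.
 */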

/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format
 * Code in:     kernel/include/trace/events/xdp.h
 */
struct cpumap_kthread_ctx {
	u64 __pad;		// First 8 bytes are not accessible by bpf code
	int map_id;		//	offset:8;  size:4; signed:1;
	u32 act;		//	offset:12; size:4; signed:0;
	int cpu;		//	offset:16; size:4; signed:1;
	unsigned int drops;	//	offset:20; size:4; signed:0;
	unsigned int processed;	//	offset:24; size:4; signed:0;
	int sched;		//	offset:28; size:4; signed:1;
};

SEC("tracepoint/xdp/xdp_cpumap_kthread")
int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
{
	struct datarec *rec;
	u32 key = 0;

	rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key);
	if (!rec)
		return 0;
	rec->processed += ctx->processed;
	rec->dropped   += ctx->drops;

	/* Count times kthread yielded CPU via schedule call */
	if (ctx->sched)
		rec->issue++;

	return 0;
}