/* XDP redirect to CPUs via cpumap (BPF_MAP_TYPE_CPUMAP)
 *
 * GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
 */
#include <uapi/linux/if_ether.h>
#include <uapi/linux/if_packet.h>
#include <uapi/linux/if_vlan.h>
#include <uapi/linux/ip.h>
#include <uapi/linux/ipv6.h>
#include <uapi/linux/in.h>
#include <uapi/linux/tcp.h>
#include <uapi/linux/udp.h>

#include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include "hash_func01.h"
#define MAX_CPUS NR_CPUS
/* Special map type that can XDP_REDIRECT frames to another CPU */
struct {
	__uint(type, BPF_MAP_TYPE_CPUMAP);
	__uint(key_size, sizeof(u32));
	__uint(value_size, sizeof(struct bpf_cpumap_val));
	__uint(max_entries, MAX_CPUS);
} cpu_map SEC(".maps");
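/* Note: the cpu_map slots are expected to be populated from user space
 * (typically a matching loader program) with a struct bpf_cpumap_val,
 * whose qsize member sizes the per-CPU kthread queue.  Redirecting to a
 * slot that has not been populated fails and the frame is dropped.
 */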
/* Common stats data record to keep userspace more simple */
struct datarec {
	__u64 processed;
	__u64 dropped;
	__u64 issue;
	__u64 xdp_pass;
	__u64 xdp_drop;
	__u64 xdp_redirect;
};
/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success
 * feedback.  Redirect TX errors can be caught via a tracepoint.
 */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, u32);
	__type(value, struct datarec);
	__uint(max_entries, 1);
} rx_cnt SEC(".maps");
/* Used by trace point */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, u32);
	__type(value, struct datarec);
	__uint(max_entries, 2);
	/* TODO: have entries for all possible errno's */
} redirect_err_cnt SEC(".maps");
/* Used by trace point */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, u32);
	__type(value, struct datarec);
	__uint(max_entries, MAX_CPUS);
} cpumap_enqueue_cnt SEC(".maps");
/* Used by trace point */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, u32);
	__type(value, struct datarec);
	__uint(max_entries, 1);
} cpumap_kthread_cnt SEC(".maps");
/* Set of maps controlling available CPU, and for iterating through
 * selectable redirect CPUs.
 */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__type(key, u32);
	__type(value, u32);
	__uint(max_entries, MAX_CPUS);
} cpus_available SEC(".maps");
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__type(key, u32);
	__type(value, u32);
	__uint(max_entries, 1);
} cpus_count SEC(".maps");
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, u32);
	__type(value, u32);
	__uint(max_entries, 1);
} cpus_iterator SEC(".maps");
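/* These arrays are also expected to be managed from user space:
 * cpus_available[i] holds the real CPU number for index i, cpus_count
 * holds how many indexes are currently valid, and cpus_iterator keeps
 * the per-CPU round-robin position used by prognum2 below.
 */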
/* Used by trace point */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, u32);
	__type(value, struct datarec);
	__uint(max_entries, 1);
} exception_cnt SEC(".maps");
/* Helper parse functions */

/* Parse Ethernet layer 2, extract network layer 3 offset and protocol
 *
 * Returns false on error and non-supported ether-type
 */
struct vlan_hdr {
	__be16 h_vlan_TCI;
	__be16 h_vlan_encapsulated_proto;
};
static __always_inline
bool parse_eth(struct ethhdr *eth, void *data_end,
	       u16 *eth_proto, u64 *l3_offset)
{
	u16 eth_type;
	u64 offset;

	offset = sizeof(*eth);
	if ((void *)eth + offset > data_end)
		return false;

	eth_type = eth->h_proto;

	/* Skip non 802.3 Ethertypes */
	if (unlikely(ntohs(eth_type) < ETH_P_802_3_MIN))
		return false;

	/* Handle VLAN tagged packet */
	if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) {
		struct vlan_hdr *vlan_hdr;

		vlan_hdr = (void *)eth + offset;
		offset += sizeof(*vlan_hdr);
		if ((void *)eth + offset > data_end)
			return false;
		eth_type = vlan_hdr->h_vlan_encapsulated_proto;
	}
	/* Handle double VLAN tagged packet */
	if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) {
		struct vlan_hdr *vlan_hdr;

		vlan_hdr = (void *)eth + offset;
		offset += sizeof(*vlan_hdr);
		if ((void *)eth + offset > data_end)
			return false;
		eth_type = vlan_hdr->h_vlan_encapsulated_proto;
	}

	*eth_proto = ntohs(eth_type);
	*l3_offset = offset;
	return true;
}
static __always_inline
u16 get_dest_port_ipv4_udp(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct iphdr *iph = data + nh_off;
	struct udphdr *udph;
	u16 dport;

	if (iph + 1 > data_end)
		return 0;
	if (!(iph->protocol == IPPROTO_UDP))
		return 0;

	udph = (void *)(iph + 1);
	if (udph + 1 > data_end)
		return 0;

	dport = ntohs(udph->dest);
	return dport;
}
static __always_inline
int get_proto_ipv4(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct iphdr *iph = data + nh_off;

	if (iph + 1 > data_end)
		return 0;
	return iph->protocol;
}
static __always_inline
int get_proto_ipv6(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ipv6hdr *ip6h = data + nh_off;

	if (ip6h + 1 > data_end)
		return 0;
	return ip6h->nexthdr;
}
SEC("xdp_cpu_map0")
int xdp_prognum0_no_touch(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct datarec *rec;
	u32 *cpu_selected;
	u32 cpu_dest;
	u32 key = 0;

	/* Only use first entry in cpus_available */
	cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
	if (!cpu_selected)
		return XDP_ABORTED;
	cpu_dest = *cpu_selected;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}
	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}
SEC("xdp_cpu_map1_touch_data")
int xdp_prognum1_touch_data(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	struct datarec *rec;
	u32 *cpu_selected;
	u32 cpu_dest;
	u16 eth_type;
	u32 key = 0;

	/* Only use first entry in cpus_available */
	cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
	if (!cpu_selected)
		return XDP_ABORTED;
	cpu_dest = *cpu_selected;

	/* Validate packet length is minimum Eth header size */
	if (eth + 1 > data_end)
		return XDP_ABORTED;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	/* Read packet data, and use it (drop non 802.3 Ethertypes) */
	eth_type = eth->h_proto;
	if (ntohs(eth_type) < ETH_P_802_3_MIN) {
		rec->dropped++;
		return XDP_DROP;
	}

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}
	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}
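/* Compared to prognum0, reading eth->h_proto above forces the RX CPU to
 * touch the packet data before the cpumap redirect, which is roughly the
 * extra per-packet cost this variant is meant to expose.
 */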
SEC("xdp_cpu_map2_round_robin")
int xdp_prognum2_round_robin(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	struct datarec *rec;
	u32 cpu_dest;
	u32 key0 = 0;

	u32 *cpu_selected;
	u32 *cpu_iterator;
	u32 *cpu_max;
	u32 cpu_idx;

	cpu_max = bpf_map_lookup_elem(&cpus_count, &key0);
	if (!cpu_max)
		return XDP_ABORTED;

	cpu_iterator = bpf_map_lookup_elem(&cpus_iterator, &key0);
	if (!cpu_iterator)
		return XDP_ABORTED;
	cpu_idx = *cpu_iterator;

	*cpu_iterator += 1;
	if (*cpu_iterator == *cpu_max)
		*cpu_iterator = 0;

	cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_selected)
		return XDP_ABORTED;
	cpu_dest = *cpu_selected;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key0);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}
	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}
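/* Note: cpus_iterator is a per-CPU array, so the round-robin position is
 * kept per RX CPU rather than globally; this avoids cross-CPU contention
 * on the iterator, at the price of a less strictly balanced distribution.
 */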
SEC("xdp_cpu_map3_proto_separate")
int xdp_prognum3_proto_separate(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	u8 ip_proto = IPPROTO_UDP;
	struct datarec *rec;
	u16 eth_proto = 0;
	u64 l3_offset = 0;
	u32 cpu_dest = 0;
	u32 cpu_idx = 0;
	u32 *cpu_lookup;
	u32 key = 0;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
		return XDP_PASS; /* Just skip */

	/* Extract L4 protocol */
	switch (eth_proto) {
	case ETH_P_IP:
		ip_proto = get_proto_ipv4(ctx, l3_offset);
		break;
	case ETH_P_IPV6:
		ip_proto = get_proto_ipv6(ctx, l3_offset);
		break;
	case ETH_P_ARP:
		cpu_idx = 0; /* ARP packet handled on separate CPU */
		break;
	default:
		cpu_idx = 0;
	}

	/* Choose CPU based on L4 protocol */
	switch (ip_proto) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		cpu_idx = 2;
		break;
	case IPPROTO_TCP:
		cpu_idx = 0;
		break;
	case IPPROTO_UDP:
		cpu_idx = 1;
		break;
	default:
		cpu_idx = 0;
	}

	cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_lookup)
		return XDP_ABORTED;
	cpu_dest = *cpu_lookup;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}
	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}
SEC("xdp_cpu_map4_ddos_filter_pktgen")
int xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	u8 ip_proto = IPPROTO_UDP;
	struct datarec *rec;
	u16 eth_proto = 0;
	u64 l3_offset = 0;
	u32 cpu_dest = 0;
	u32 cpu_idx = 0;
	u16 dest_port;
	u32 *cpu_lookup;
	u32 key = 0;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
		return XDP_PASS; /* Just skip */

	/* Extract L4 protocol */
	switch (eth_proto) {
	case ETH_P_IP:
		ip_proto = get_proto_ipv4(ctx, l3_offset);
		break;
	case ETH_P_IPV6:
		ip_proto = get_proto_ipv6(ctx, l3_offset);
		break;
	case ETH_P_ARP:
		cpu_idx = 0; /* ARP packet handled on separate CPU */
		break;
	default:
		cpu_idx = 0;
	}

	/* Choose CPU based on L4 protocol */
	switch (ip_proto) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		cpu_idx = 2;
		break;
	case IPPROTO_TCP:
		cpu_idx = 0;
		break;
	case IPPROTO_UDP:
		cpu_idx = 1;
		/* DDoS filter UDP port 9 (pktgen) */
		dest_port = get_dest_port_ipv4_udp(ctx, l3_offset);
		if (dest_port == 9) {
			rec->dropped++;
			return XDP_DROP;
		}
		break;
	default:
		cpu_idx = 0;
	}

	cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_lookup)
		return XDP_ABORTED;
	cpu_dest = *cpu_lookup;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}
	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}
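/* Dropping the pktgen flood (UDP dest port 9) already on the RX CPU means
 * the attack traffic is never enqueued to the remote cpumap CPUs; only the
 * remaining traffic pays the redirect cost.
 */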
/* Hashing initval */
#define INITVAL 15485863
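/* 15485863 is an arbitrary prime; it only seeds SuperFastHash together
 * with the L4 protocol number in the helpers below.
 */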
static __always_inline
u32 get_ipv4_hash_ip_pair(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct iphdr *iph = data + nh_off;
	u32 cpu_hash;

	if (iph + 1 > data_end)
		return 0;

	cpu_hash = iph->saddr + iph->daddr;
	cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + iph->protocol);

	return cpu_hash;
}
static __always_inline
u32 get_ipv6_hash_ip_pair(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ipv6hdr *ip6h = data + nh_off;
	u32 cpu_hash;

	if (ip6h + 1 > data_end)
		return 0;

	cpu_hash  = ip6h->saddr.s6_addr32[0] + ip6h->daddr.s6_addr32[0];
	cpu_hash += ip6h->saddr.s6_addr32[1] + ip6h->daddr.s6_addr32[1];
	cpu_hash += ip6h->saddr.s6_addr32[2] + ip6h->daddr.s6_addr32[2];
	cpu_hash += ip6h->saddr.s6_addr32[3] + ip6h->daddr.s6_addr32[3];
	cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + ip6h->nexthdr);

	return cpu_hash;
}
/* Load-Balance traffic based on hashing IP-addrs + L4-proto.  The
 * hashing scheme is symmetric, meaning swapping IP src/dest still hit
 * same CPU.
 */
SEC("xdp_cpu_map5_lb_hash_ip_pairs")
int xdp_prognum5_lb_hash_ip_pairs(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	u8 ip_proto = IPPROTO_UDP;
	struct datarec *rec;
	u16 eth_proto = 0;
	u64 l3_offset = 0;
	u32 cpu_dest = 0;
	u32 cpu_idx = 0;
	u32 *cpu_lookup;
	u32 *cpu_max;
	u32 cpu_hash;
	u32 key = 0;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	cpu_max = bpf_map_lookup_elem(&cpus_count, &key);
	if (!cpu_max)
		return XDP_ABORTED;

	if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
		return XDP_PASS; /* Just skip */

	/* Hash for IPv4 and IPv6 */
	switch (eth_proto) {
	case ETH_P_IP:
		cpu_hash = get_ipv4_hash_ip_pair(ctx, l3_offset);
		break;
	case ETH_P_IPV6:
		cpu_hash = get_ipv6_hash_ip_pair(ctx, l3_offset);
		break;
	case ETH_P_ARP: /* ARP packet handled on CPU idx 0 */
	default:
		cpu_hash = 0;
	}

	/* Choose CPU based on hash */
	cpu_idx = cpu_hash % *cpu_max;

	cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_lookup)
		return XDP_ABORTED;
	cpu_dest = *cpu_lookup;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}
	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}
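/* Because the hash only mixes source+dest IP (addition is commutative)
 * and the L4 protocol, both directions of a flow map to the same cpu_idx,
 * which keeps a flow on one CPU and helps avoid packet reordering.
 */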
char _license[] SEC("license") = "GPL";

/*** Trace point code ***/
/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format
 * Code in:            kernel/include/trace/events/xdp.h
 */
struct xdp_redirect_ctx {
	u64 __pad;		// First 8 bytes are not accessible by bpf code
	int prog_id;		// offset:8;  size:4; signed:1;
	u32 act;		// offset:12; size:4; signed:0;
	int ifindex;		// offset:16; size:4; signed:1;
	int err;		// offset:20; size:4; signed:1;
	int to_ifindex;		// offset:24; size:4; signed:1;
	u32 map_id;		// offset:28; size:4; signed:0;
	int map_index;		// offset:32; size:4; signed:1;
};

enum {
	XDP_REDIRECT_SUCCESS = 0,
	XDP_REDIRECT_ERROR = 1
};
static __always_inline
int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx)
{
	u32 key = XDP_REDIRECT_ERROR;
	struct datarec *rec;
	int err = ctx->err;

	if (!err)
		key = XDP_REDIRECT_SUCCESS;

	rec = bpf_map_lookup_elem(&redirect_err_cnt, &key);
	if (!rec)
		return 0;
	rec->dropped += 1;

	return 0; /* Indicate event was filtered (no further processing) */
	/*
	 * Returning 1 here would allow e.g. a perf-record tracepoint
	 * to see and record these events, but it doesn't work well
	 * in practice as stopping perf-record also unloads this
	 * bpf_prog.  Plus, there is additional overhead of doing so.
	 */
}
SEC("tracepoint/xdp/xdp_redirect_err")
int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx)
{
	return xdp_redirect_collect_stat(ctx);
}

SEC("tracepoint/xdp/xdp_redirect_map_err")
int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx)
{
	return xdp_redirect_collect_stat(ctx);
}
/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format
 * Code in:            kernel/include/trace/events/xdp.h
 */
struct xdp_exception_ctx {
	u64 __pad;		// First 8 bytes are not accessible by bpf code
	int prog_id;		// offset:8;  size:4; signed:1;
	u32 act;		// offset:12; size:4; signed:0;
	int ifindex;		// offset:16; size:4; signed:1;
};

SEC("tracepoint/xdp/xdp_exception")
int trace_xdp_exception(struct xdp_exception_ctx *ctx)
{
	struct datarec *rec;
	u32 key = 0;

	rec = bpf_map_lookup_elem(&exception_cnt, &key);
	if (!rec)
		return 0;
	rec->dropped += 1;

	return 0;
}
/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format
 * Code in:     kernel/include/trace/events/xdp.h
 */
struct cpumap_enqueue_ctx {
	u64 __pad;			// First 8 bytes are not accessible by bpf code
	int map_id;			// offset:8;  size:4; signed:1;
	u32 act;			// offset:12; size:4; signed:0;
	int cpu;			// offset:16; size:4; signed:1;
	unsigned int drops;		// offset:20; size:4; signed:0;
	unsigned int processed;		// offset:24; size:4; signed:0;
	int to_cpu;			// offset:28; size:4; signed:1;
};
SEC("tracepoint/xdp/xdp_cpumap_enqueue")
int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx)
{
	u32 to_cpu = ctx->to_cpu;
	struct datarec *rec;

	if (to_cpu >= MAX_CPUS)
		return 1;

	rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu);
	if (!rec)
		return 0;
	rec->processed += ctx->processed;
	rec->dropped   += ctx->drops;

	/* Record bulk events, then userspace can calc average bulk size */
	if (ctx->processed > 0)
		rec->issue += 1;

	/* Inception: It's possible to detect overload situations, via
	 * this tracepoint.  This can be used for creating a feedback
	 * loop to XDP, which can take appropriate actions to mitigate
	 * this overload situation.
	 */
	return 0;
}
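/* User space can derive the average enqueue bulk size from these counters
 * as processed / issue, since issue is bumped once per non-empty enqueue
 * event while processed sums the frames.
 */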
/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format
 * Code in:     kernel/include/trace/events/xdp.h
 */
struct cpumap_kthread_ctx {
	u64 __pad;			// First 8 bytes are not accessible
	int map_id;			// offset:8;  size:4; signed:1;
	u32 act;			// offset:12; size:4; signed:0;
	int cpu;			// offset:16; size:4; signed:1;
	unsigned int drops;		// offset:20; size:4; signed:0;
	unsigned int processed;		// offset:24; size:4; signed:0;
	int sched;			// offset:28; size:4; signed:1;
	unsigned int xdp_pass;		// offset:32; size:4; signed:0;
	unsigned int xdp_drop;		// offset:36; size:4; signed:0;
	unsigned int xdp_redirect;	// offset:40; size:4; signed:0;
};
SEC("tracepoint/xdp/xdp_cpumap_kthread")
int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
{
	struct datarec *rec;
	u32 key = 0;

	rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key);
	if (!rec)
		return 0;
	rec->processed += ctx->processed;
	rec->dropped   += ctx->drops;
	rec->xdp_pass  += ctx->xdp_pass;
	rec->xdp_drop  += ctx->xdp_drop;
	rec->xdp_redirect += ctx->xdp_redirect;

	/* Count times kthread yielded CPU via schedule call */
	if (ctx->sched)
		rec->issue++;

	return 0;
}