/* XDP redirect to CPUs via cpumap (BPF_MAP_TYPE_CPUMAP)
 *
 * GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
 */
#include <uapi/linux/if_ether.h>
#include <uapi/linux/if_packet.h>
#include <uapi/linux/if_vlan.h>
#include <uapi/linux/ip.h>
#include <uapi/linux/ipv6.h>
#include <uapi/linux/in.h>
#include <uapi/linux/tcp.h>
#include <uapi/linux/udp.h>

#include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include "hash_func01.h"

#define MAX_CPUS NR_CPUS
/* Special map type that can XDP_REDIRECT frames to another CPU */
struct {
	__uint(type, BPF_MAP_TYPE_CPUMAP);
	__uint(key_size, sizeof(u32));
	__uint(value_size, sizeof(u32));
	__uint(max_entries, MAX_CPUS);
} cpu_map SEC(".maps");
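/* Population sketch (userspace side, not compiled into this object):
 * a CPU only becomes a valid redirect target once an entry is
 * inserted, where the value is the cpumap kthread queue size.
 * Assuming "cpu_map_fd" is this map's fd:
 *
 *	__u32 cpu = 2, qsize = 192;
 *	bpf_map_update_elem(cpu_map_fd, &cpu, &qsize, 0);
 */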
/* Common stats data record to keep userspace more simple */
struct datarec {
	__u64 processed;
	__u64 dropped;
	__u64 issue;
};

/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success
 * feedback.  Redirect TX errors can be caught via a tracepoint.
 */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, u32);
	__type(value, struct datarec);
	__uint(max_entries, 1);
} rx_cnt SEC(".maps");
/* Used by trace point */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, u32);
	__type(value, struct datarec);
	__uint(max_entries, 2);
	/* TODO: have entries for all possible errno's */
} redirect_err_cnt SEC(".maps");
/* Used by trace point */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, u32);
	__type(value, struct datarec);
	__uint(max_entries, MAX_CPUS);
} cpumap_enqueue_cnt SEC(".maps");
/* Used by trace point */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, u32);
	__type(value, struct datarec);
	__uint(max_entries, 1);
} cpumap_kthread_cnt SEC(".maps");
/* Set of maps controlling available CPUs, and for iterating through
 * selectable redirect CPUs.
 */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__type(key, u32);
	__type(value, u32);
	__uint(max_entries, MAX_CPUS);
} cpus_available SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__type(key, u32);
	__type(value, u32);
	__uint(max_entries, 1);
} cpus_count SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, u32);
	__type(value, u32);
	__uint(max_entries, 1);
} cpus_iterator SEC(".maps");
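/* How these three maps cooperate: userspace writes the chosen CPU ids
 * into cpus_available[0..N-1] and stores N in cpus_count; the per-CPU
 * cpus_iterator holds each RX CPU's current position when a program
 * walks cpus_available round-robin.
 */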
/* Used by trace point */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, u32);
	__type(value, struct datarec);
	__uint(max_entries, 1);
} exception_cnt SEC(".maps");
/* Helper parse functions */

/* Parse Ethernet layer 2, extract network layer 3 offset and protocol
 *
 * Returns false on error and non-supported ether-type
 */
struct vlan_hdr {
	__be16 h_vlan_TCI;
	__be16 h_vlan_encapsulated_proto;
};
static __always_inline
bool parse_eth(struct ethhdr *eth, void *data_end,
	       u16 *eth_proto, u64 *l3_offset)
{
	u16 eth_type;
	u64 offset;

	offset = sizeof(*eth);
	if ((void *)eth + offset > data_end)
		return false;

	eth_type = eth->h_proto;

	/* Skip non 802.3 Ethertypes */
	if (unlikely(ntohs(eth_type) < ETH_P_802_3_MIN))
		return false;

	/* Handle VLAN tagged packet */
	if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) {
		struct vlan_hdr *vlan_hdr;

		vlan_hdr = (void *)eth + offset;
		offset += sizeof(*vlan_hdr);
		if ((void *)eth + offset > data_end)
			return false;
		eth_type = vlan_hdr->h_vlan_encapsulated_proto;
	}
	/* Handle double VLAN tagged packet */
	if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) {
		struct vlan_hdr *vlan_hdr;

		vlan_hdr = (void *)eth + offset;
		offset += sizeof(*vlan_hdr);
		if ((void *)eth + offset > data_end)
			return false;
		eth_type = vlan_hdr->h_vlan_encapsulated_proto;
	}

	*eth_proto = ntohs(eth_type);
	*l3_offset = offset;
	return true;
}
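/* Design note (assumption about verifier history): the double-VLAN
 * check is unrolled rather than looped, since the verifier at the
 * time this sample was written rejected loops; two fixed iterations
 * bound QinQ parsing.
 */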
static __always_inline
u16 get_dest_port_ipv4_udp(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct iphdr *iph = data + nh_off;
	struct udphdr *udph;
	u16 dport;

	if (iph + 1 > data_end)
		return 0;
	if (!(iph->protocol == IPPROTO_UDP))
		return 0;
	udph = (void *)(iph + 1);
	if (udph + 1 > data_end)
		return 0;
	dport = ntohs(udph->dest);
	return dport;
}
static __always_inline
int get_proto_ipv4(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct iphdr *iph = data + nh_off;

	if (iph + 1 > data_end)
		return 0;
	return iph->protocol;
}
static __always_inline
int get_proto_ipv6(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ipv6hdr *ip6h = data + nh_off;

	if (ip6h + 1 > data_end)
		return 0;
	return ip6h->nexthdr;
}
SEC("xdp_cpu_map0")
int xdp_prognum0_no_touch(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct datarec *rec;
	u32 *cpu_selected;
	u32 cpu_dest;
	u32 key = 0;

	/* Only use first entry in cpus_available */
	cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
	if (!cpu_selected)
		return XDP_ABORTED;
	cpu_dest = *cpu_selected;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}
	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}
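/* Attach sketch (hypothetical device name; the normal loader is the
 * sample's companion user program, which also populates the maps,
 * but iproute2 can attach a specific program section directly):
 *
 *	# ip link set dev eth0 xdp obj xdp_redirect_cpu_kern.o \
 *		sec xdp_cpu_map0
 */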
SEC("xdp_cpu_map1_touch_data")
int xdp_prognum1_touch_data(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	struct datarec *rec;
	u32 *cpu_selected;
	u32 cpu_dest;
	u16 eth_type;
	u32 key = 0;

	/* Only use first entry in cpus_available */
	cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
	if (!cpu_selected)
		return XDP_ABORTED;
	cpu_dest = *cpu_selected;

	/* Validate packet length is minimum Eth header size */
	if (eth + 1 > data_end)
		return XDP_ABORTED;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	/* Read packet data, and use it (drop non 802.3 Ethertypes) */
	eth_type = eth->h_proto;
	if (ntohs(eth_type) < ETH_P_802_3_MIN) {
		rec->dropped++;
		return XDP_DROP;
	}

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}
	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}
SEC("xdp_cpu_map2_round_robin")
int xdp_prognum2_round_robin(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	struct datarec *rec;
	u32 *cpu_selected;
	u32 *cpu_iterator;
	u32 *cpu_max;
	u32 cpu_dest;
	u32 cpu_idx;
	u32 key0 = 0;

	cpu_max = bpf_map_lookup_elem(&cpus_count, &key0);
	if (!cpu_max)
		return XDP_ABORTED;

	cpu_iterator = bpf_map_lookup_elem(&cpus_iterator, &key0);
	if (!cpu_iterator)
		return XDP_ABORTED;
	cpu_idx = *cpu_iterator;

	*cpu_iterator += 1;
	if (*cpu_iterator == *cpu_max)
		*cpu_iterator = 0;

	cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_selected)
		return XDP_ABORTED;
	cpu_dest = *cpu_selected;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key0);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}
	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}
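/* Design note: cpus_iterator is a PERCPU_ARRAY, so each RX CPU keeps
 * a private position and the rotation is round-robin per RX CPU, not
 * globally; this trades exact fairness for atomic-free updates.
 */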
SEC("xdp_cpu_map3_proto_separate")
int xdp_prognum3_proto_separate(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	u8 ip_proto = IPPROTO_UDP;
	struct datarec *rec;
	u16 eth_proto = 0;
	u64 l3_offset = 0;
	u32 cpu_dest = 0;
	u32 cpu_idx = 0;
	u32 *cpu_lookup;
	u32 key = 0;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
		return XDP_PASS; /* Just skip */

	/* Extract L4 protocol */
	switch (eth_proto) {
	case ETH_P_IP:
		ip_proto = get_proto_ipv4(ctx, l3_offset);
		break;
	case ETH_P_IPV6:
		ip_proto = get_proto_ipv6(ctx, l3_offset);
		break;
	case ETH_P_ARP:
		cpu_idx = 0; /* ARP packet handled on separate CPU */
		break;
	default:
		cpu_idx = 0;
	}

	/* Choose CPU based on L4 protocol */
	switch (ip_proto) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		cpu_idx = 2;
		break;
	case IPPROTO_TCP:
		cpu_idx = 0;
		break;
	case IPPROTO_UDP:
		cpu_idx = 1;
		break;
	default:
		cpu_idx = 0;
	}

	cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_lookup)
		return XDP_ABORTED;
	cpu_dest = *cpu_lookup;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}
	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}
SEC("xdp_cpu_map4_ddos_filter_pktgen")
int xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	u8 ip_proto = IPPROTO_UDP;
	struct datarec *rec;
	u16 eth_proto = 0;
	u64 l3_offset = 0;
	u32 cpu_dest = 0;
	u32 cpu_idx = 0;
	u16 dest_port;
	u32 *cpu_lookup;
	u32 key = 0;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
		return XDP_PASS; /* Just skip */

	/* Extract L4 protocol */
	switch (eth_proto) {
	case ETH_P_IP:
		ip_proto = get_proto_ipv4(ctx, l3_offset);
		break;
	case ETH_P_IPV6:
		ip_proto = get_proto_ipv6(ctx, l3_offset);
		break;
	case ETH_P_ARP:
		cpu_idx = 0; /* ARP packet handled on separate CPU */
		break;
	default:
		cpu_idx = 0;
	}

	/* Choose CPU based on L4 protocol */
	switch (ip_proto) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		cpu_idx = 2;
		break;
	case IPPROTO_TCP:
		cpu_idx = 0;
		break;
	case IPPROTO_UDP:
		cpu_idx = 1;
		/* DDoS filter UDP port 9 (pktgen) */
		dest_port = get_dest_port_ipv4_udp(ctx, l3_offset);
		if (dest_port == 9) {
			rec->dropped++;
			return XDP_DROP;
		}
		break;
	default:
		cpu_idx = 0;
	}

	cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_lookup)
		return XDP_ABORTED;
	cpu_dest = *cpu_lookup;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}
	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}
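/* Note: get_dest_port_ipv4_udp() only parses IPv4 headers, so the
 * port-9 filter above does not match UDP-over-IPv6 packets.
 */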
/* Hashing initval */
#define INITVAL 15485863

static __always_inline
u32 get_ipv4_hash_ip_pair(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct iphdr *iph = data + nh_off;
	u32 cpu_hash;

	if (iph + 1 > data_end)
		return 0;

	cpu_hash = iph->saddr + iph->daddr;
	cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + iph->protocol);
	return cpu_hash;
}
static __always_inline
u32 get_ipv6_hash_ip_pair(struct xdp_md *ctx, u64 nh_off)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ipv6hdr *ip6h = data + nh_off;
	u32 cpu_hash;

	if (ip6h + 1 > data_end)
		return 0;

	cpu_hash  = ip6h->saddr.s6_addr32[0] + ip6h->daddr.s6_addr32[0];
	cpu_hash += ip6h->saddr.s6_addr32[1] + ip6h->daddr.s6_addr32[1];
	cpu_hash += ip6h->saddr.s6_addr32[2] + ip6h->daddr.s6_addr32[2];
	cpu_hash += ip6h->saddr.s6_addr32[3] + ip6h->daddr.s6_addr32[3];
	cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + ip6h->nexthdr);
	return cpu_hash;
}
/* Load-Balance traffic based on hashing IP-addrs + L4-proto.  The
 * hashing scheme is symmetric, meaning swapping IP src/dest still
 * hits the same CPU, because the addresses are summed before hashing.
 */
SEC("xdp_cpu_map5_lb_hash_ip_pairs")
int xdp_prognum5_lb_hash_ip_pairs(struct xdp_md *ctx)
{
	void *data_end = (void *)(long)ctx->data_end;
	void *data     = (void *)(long)ctx->data;
	struct ethhdr *eth = data;
	u8 ip_proto = IPPROTO_UDP;
	struct datarec *rec;
	u16 eth_proto = 0;
	u64 l3_offset = 0;
	u32 cpu_dest = 0;
	u32 cpu_idx = 0;
	u32 *cpu_lookup;
	u32 *cpu_max;
	u32 cpu_hash;
	u32 key = 0;

	/* Count RX packet in map */
	rec = bpf_map_lookup_elem(&rx_cnt, &key);
	if (!rec)
		return XDP_ABORTED;
	rec->processed++;

	cpu_max = bpf_map_lookup_elem(&cpus_count, &key);
	if (!cpu_max)
		return XDP_ABORTED;

	if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
		return XDP_PASS; /* Just skip */

	/* Hash for IPv4 and IPv6 */
	switch (eth_proto) {
	case ETH_P_IP:
		cpu_hash = get_ipv4_hash_ip_pair(ctx, l3_offset);
		break;
	case ETH_P_IPV6:
		cpu_hash = get_ipv6_hash_ip_pair(ctx, l3_offset);
		break;
	case ETH_P_ARP: /* ARP packet handled on CPU idx 0 */
	default:
		cpu_hash = 0;
	}

	/* Choose CPU based on hash */
	cpu_idx = cpu_hash % *cpu_max;

	cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
	if (!cpu_lookup)
		return XDP_ABORTED;
	cpu_dest = *cpu_lookup;

	if (cpu_dest >= MAX_CPUS) {
		rec->issue++;
		return XDP_ABORTED;
	}
	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}
char _license[] SEC("license") = "GPL";
/*** Trace point code ***/

/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format
 * Code in: kernel/include/trace/events/xdp.h
 */
struct xdp_redirect_ctx {
	u64 __pad;	// First 8 bytes are not accessible by bpf code
	int prog_id;	//	offset:8;  size:4; signed:1;
	u32 act;	//	offset:12; size:4; signed:0;
	int ifindex;	//	offset:16; size:4; signed:1;
	int err;	//	offset:20; size:4; signed:1;
	int to_ifindex;	//	offset:24; size:4; signed:1;
	u32 map_id;	//	offset:28; size:4; signed:0;
	int map_index;	//	offset:32; size:4; signed:1;
};			//	offset:36
enum {
	XDP_REDIRECT_SUCCESS = 0,
	XDP_REDIRECT_ERROR = 1
};
static __always_inline
int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx)
{
	u32 key = XDP_REDIRECT_ERROR;
	struct datarec *rec;
	int err = ctx->err;

	if (!err)
		key = XDP_REDIRECT_SUCCESS;

	rec = bpf_map_lookup_elem(&redirect_err_cnt, &key);
	if (!rec)
		return 0;
	rec->dropped += 1;

	return 0; /* Indicate event was filtered (no further processing) */
	/*
	 * Returning 1 here would allow e.g. a perf-record tracepoint
	 * to see and record these events, but it doesn't work well
	 * in practice, as stopping perf-record also unloads this
	 * bpf_prog.  Plus, there is additional overhead of doing so.
	 */
}
SEC("tracepoint/xdp/xdp_redirect_err")
int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx)
{
	return xdp_redirect_collect_stat(ctx);
}
SEC("tracepoint/xdp/xdp_redirect_map_err")
int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx)
{
	return xdp_redirect_collect_stat(ctx);
}
/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format
 * Code in: kernel/include/trace/events/xdp.h
 */
struct xdp_exception_ctx {
	u64 __pad;	// First 8 bytes are not accessible by bpf code
	int prog_id;	//	offset:8;  size:4; signed:1;
	u32 act;	//	offset:12; size:4; signed:0;
	int ifindex;	//	offset:16; size:4; signed:1;
};
SEC("tracepoint/xdp/xdp_exception")
int trace_xdp_exception(struct xdp_exception_ctx *ctx)
{
	struct datarec *rec;
	u32 key = 0;

	rec = bpf_map_lookup_elem(&exception_cnt, &key);
	if (!rec)
		return 1;
	rec->dropped += 1;

	return 0;
}
/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format
 * Code in: kernel/include/trace/events/xdp.h
 */
struct cpumap_enqueue_ctx {
	u64 __pad;		// First 8 bytes are not accessible by bpf code
	int map_id;		//	offset:8;  size:4; signed:1;
	u32 act;		//	offset:12; size:4; signed:0;
	int cpu;		//	offset:16; size:4; signed:1;
	unsigned int drops;	//	offset:20; size:4; signed:0;
	unsigned int processed;	//	offset:24; size:4; signed:0;
	int to_cpu;		//	offset:28; size:4; signed:1;
};
SEC("tracepoint/xdp/xdp_cpumap_enqueue")
int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx)
{
	u32 to_cpu = ctx->to_cpu;
	struct datarec *rec;

	if (to_cpu >= MAX_CPUS)
		return 1;

	rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu);
	if (!rec)
		return 0;
	rec->processed += ctx->processed;
	rec->dropped   += ctx->drops;

	/* Record bulk events, then userspace can calc average bulk size */
	if (ctx->processed > 0)
		rec->issue += 1;

	/* Inception: It's possible to detect overload situations via
	 * this tracepoint.  This can be used for creating a feedback
	 * loop to XDP, which can take appropriate actions to mitigate
	 * this overload situation.
	 */
	return 0;
}
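/* Worked example: userspace derives the average enqueue bulk size as
 * processed / issue, e.g. 512 packets over 8 bulk enqueues gives an
 * average bulk of 64 packets.
 */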
/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format
 * Code in: kernel/include/trace/events/xdp.h
 */
struct cpumap_kthread_ctx {
	u64 __pad;		// First 8 bytes are not accessible by bpf code
	int map_id;		//	offset:8;  size:4; signed:1;
	u32 act;		//	offset:12; size:4; signed:0;
	int cpu;		//	offset:16; size:4; signed:1;
	unsigned int drops;	//	offset:20; size:4; signed:0;
	unsigned int processed;	//	offset:24; size:4; signed:0;
	int sched;		//	offset:28; size:4; signed:1;
};
SEC("tracepoint/xdp/xdp_cpumap_kthread")
int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
{
	struct datarec *rec;
	u32 key = 0;

	rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key);
	if (!rec)
		return 0;
	rec->processed += ctx->processed;
	rec->dropped   += ctx->drops;

	/* Count times kthread yielded CPU via schedule call */
	if (ctx->sched)
		rec->issue++;

	return 0;
}