// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2022 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>
/* task->flags for off-cpu analysis */
#define PF_KTHREAD   0x00200000  /* I am a kernel thread */

/* task->state for off-cpu analysis */
#define TASK_INTERRUPTIBLE	0x0001
#define TASK_UNINTERRUPTIBLE	0x0002

/* capacity of the stack-trace and off-cpu accounting maps */
#define MAX_ENTRIES	102400
33 __uint(type, BPF_MAP_TYPE_STACK_TRACE);
34 __uint(key_size, sizeof(__u32));
35 __uint(value_size, MAX_STACKS * sizeof(__u64));
36 __uint(max_entries, MAX_ENTRIES);
37 } stacks SEC(".maps");
40 __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
41 __uint(map_flags, BPF_F_NO_PREALLOC);
43 __type(value, struct tstamp_data);
44 } tstamp SEC(".maps");
47 __uint(type, BPF_MAP_TYPE_HASH);
48 __uint(key_size, sizeof(struct offcpu_key));
49 __uint(value_size, sizeof(__u64));
50 __uint(max_entries, MAX_ENTRIES);
51 } off_cpu SEC(".maps");
54 __uint(type, BPF_MAP_TYPE_HASH);
55 __uint(key_size, sizeof(__u32));
56 __uint(value_size, sizeof(__u8));
57 __uint(max_entries, 1);
58 } cpu_filter SEC(".maps");
61 __uint(type, BPF_MAP_TYPE_HASH);
62 __uint(key_size, sizeof(__u32));
63 __uint(value_size, sizeof(__u8));
64 __uint(max_entries, 1);
65 } task_filter SEC(".maps");
68 __uint(type, BPF_MAP_TYPE_HASH);
69 __uint(key_size, sizeof(__u64));
70 __uint(value_size, sizeof(__u8));
71 __uint(max_entries, 1);
72 } cgroup_filter SEC(".maps");
/* new kernel task_struct definition */
struct task_struct___new {
	long __state;	/* CO-RE matches by name; width fixed up at load */
} __attribute__((preserve_access_index));
/* old kernel task_struct definition */
struct task_struct___old {
	long state;	/* pre-rename field; read via BPF_CORE_READ below */
} __attribute__((preserve_access_index));
89 const volatile bool has_prev_state = false;
90 const volatile bool needs_cgroup = false;
91 const volatile bool uses_cgroup_v1 = false;
94 * Old kernel used to call it task_struct->state and now it's '__state'.
95 * Use BPF CO-RE "ignored suffix rule" to deal with it like below:
97 * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes
99 static inline int get_task_state(struct task_struct *t)
101 /* recast pointer to capture new type for compiler */
102 struct task_struct___new *t_new = (void *)t;
104 if (bpf_core_field_exists(t_new->__state)) {
105 return BPF_CORE_READ(t_new, __state);
107 /* recast pointer to capture old type for compiler */
108 struct task_struct___old *t_old = (void *)t;
110 return BPF_CORE_READ(t_old, state);
114 static inline __u64 get_cgroup_id(struct task_struct *t)
119 cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_event_cgrp_id], cgroup);
121 cgrp = BPF_CORE_READ(t, cgroups, dfl_cgrp);
123 return BPF_CORE_READ(cgrp, kn, id);
126 static inline int can_record(struct task_struct *t, int state)
128 /* kernel threads don't have user stack */
129 if (t->flags & PF_KTHREAD)
132 if (state != TASK_INTERRUPTIBLE &&
133 state != TASK_UNINTERRUPTIBLE)
137 __u32 cpu = bpf_get_smp_processor_id();
140 ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
149 ok = bpf_map_lookup_elem(&task_filter, &pid);
156 __u64 cgrp_id = get_cgroup_id(t);
158 ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id);
166 static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
167 struct task_struct *next, int state)
171 struct tstamp_data *pelem;
173 ts = bpf_ktime_get_ns();
175 if (!can_record(prev, state))
178 stack_id = bpf_get_stackid(ctx, &stacks,
179 BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK);
181 pelem = bpf_task_storage_get(&tstamp, prev, NULL,
182 BPF_LOCAL_STORAGE_GET_F_CREATE);
186 pelem->timestamp = ts;
187 pelem->state = state;
188 pelem->stack_id = stack_id;
191 pelem = bpf_task_storage_get(&tstamp, next, NULL, 0);
193 if (pelem && pelem->timestamp) {
194 struct offcpu_key key = {
197 .stack_id = pelem->stack_id,
198 .state = pelem->state,
199 .cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0,
201 __u64 delta = ts - pelem->timestamp;
204 total = bpf_map_lookup_elem(&off_cpu, &key);
208 bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);
210 /* prevent to reuse the timestamp later */
211 pelem->timestamp = 0;
217 SEC("tp_btf/sched_switch")
218 int on_switch(u64 *ctx)
220 struct task_struct *prev, *next;
226 prev = (struct task_struct *)ctx[1];
227 next = (struct task_struct *)ctx[2];
230 prev_state = (int)ctx[3];
232 prev_state = get_task_state(prev);
234 return off_cpu_stat(ctx, prev, next, prev_state);
237 char LICENSE[] SEC("license") = "Dual BSD/GPL";