sched/tracing: Append prev_state to tp args instead
author: Delyan Kratunov <delyank@fb.com>
Wed, 11 May 2022 18:28:36 +0000 (18:28 +0000)
committer: Peter Zijlstra <peterz@infradead.org>
Wed, 11 May 2022 22:37:11 +0000 (00:37 +0200)
Commit fa2c3254d7cf (sched/tracing: Don't re-read p->state when emitting
sched_switch event, 2022-01-20) added a new prev_state argument to the
sched_switch tracepoint, before the prev task_struct pointer.

This reordering of arguments broke BPF programs that use the raw
tracepoint (e.g. tp_btf programs). The type of the second argument
changed from struct task_struct * to unsigned int, so existing programs
that assume a task_struct* argument (e.g. for bpf_task_storage access)
will now fail to verify.

If we instead append the new argument to the end, all existing programs
would continue to work and can conditionally extract the prev_state
argument on supported kernel versions.

Fixes: fa2c3254d7cf (sched/tracing: Don't re-read p->state when emitting sched_switch event, 2022-01-20)
Signed-off-by: Delyan Kratunov <delyank@fb.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://lkml.kernel.org/r/c8a6930dfdd58a4a5755fc01732675472979732b.camel@fb.com
include/trace/events/sched.h
kernel/sched/core.c
kernel/trace/fgraph.c
kernel/trace/ftrace.c
kernel/trace/trace_events.c
kernel/trace/trace_osnoise.c
kernel/trace/trace_sched_switch.c
kernel/trace/trace_sched_wakeup.c
samples/trace_events/trace_custom_sched.h

index 65e7867..fbb99a6 100644 (file)
@@ -222,11 +222,11 @@ static inline long __trace_sched_switch_state(bool preempt,
 TRACE_EVENT(sched_switch,
 
        TP_PROTO(bool preempt,
-                unsigned int prev_state,
                 struct task_struct *prev,
-                struct task_struct *next),
+                struct task_struct *next,
+                unsigned int prev_state),
 
-       TP_ARGS(preempt, prev_state, prev, next),
+       TP_ARGS(preempt, prev, next, prev_state),
 
        TP_STRUCT__entry(
                __array(        char,   prev_comm,      TASK_COMM_LEN   )
index 51efaab..d58c038 100644 (file)
@@ -6382,7 +6382,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
                migrate_disable_switch(rq, prev);
                psi_sched_switch(prev, next, !task_on_rq_queued(prev));
 
-               trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev_state, prev, next);
+               trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
 
                /* Also unlocks the rq: */
                rq = context_switch(rq, prev, next, &rf);
index 8f4fb32..a7e84c8 100644 (file)
@@ -404,9 +404,9 @@ free:
 
 static void
 ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
-                               unsigned int prev_state,
                                struct task_struct *prev,
-                               struct task_struct *next)
+                               struct task_struct *next,
+                               unsigned int prev_state)
 {
        unsigned long long timestamp;
        int index;
index 4f1d2f5..af899b0 100644 (file)
@@ -7420,9 +7420,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
 
 static void
 ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
-                                    unsigned int prev_state,
                                     struct task_struct *prev,
-                                    struct task_struct *next)
+                                    struct task_struct *next,
+                                    unsigned int prev_state)
 {
        struct trace_array *tr = data;
        struct trace_pid_list *pid_list;
index e11e167..f97de82 100644 (file)
@@ -773,9 +773,9 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable)
 
 static void
 event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
-                                       unsigned int prev_state,
                                        struct task_struct *prev,
-                                       struct task_struct *next)
+                                       struct task_struct *next,
+                                       unsigned int prev_state)
 {
        struct trace_array *tr = data;
        struct trace_pid_list *no_pid_list;
@@ -799,9 +799,9 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
 
 static void
 event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
-                                        unsigned int prev_state,
                                         struct task_struct *prev,
-                                        struct task_struct *next)
+                                        struct task_struct *next,
+                                        unsigned int prev_state)
 {
        struct trace_array *tr = data;
        struct trace_pid_list *no_pid_list;
index e9ae1f3..afb92e2 100644 (file)
@@ -1168,9 +1168,9 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t)
  */
 static void
 trace_sched_switch_callback(void *data, bool preempt,
-                           unsigned int prev_state,
                            struct task_struct *p,
-                           struct task_struct *n)
+                           struct task_struct *n,
+                           unsigned int prev_state)
 {
        struct osnoise_variables *osn_var = this_cpu_osn_var();
 
index 45796d8..c9ffdcf 100644 (file)
@@ -22,8 +22,8 @@ static DEFINE_MUTEX(sched_register_mutex);
 
 static void
 probe_sched_switch(void *ignore, bool preempt,
-                  unsigned int prev_state,
-                  struct task_struct *prev, struct task_struct *next)
+                  struct task_struct *prev, struct task_struct *next,
+                  unsigned int prev_state)
 {
        int flags;
 
index 46429f9..330aee1 100644 (file)
@@ -426,8 +426,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 
 static void notrace
 probe_wakeup_sched_switch(void *ignore, bool preempt,
-                         unsigned int prev_state,
-                         struct task_struct *prev, struct task_struct *next)
+                         struct task_struct *prev, struct task_struct *next,
+                         unsigned int prev_state)
 {
        struct trace_array_cpu *data;
        u64 T0, T1, delta;
index 9fdd8e7..9513883 100644 (file)
@@ -25,11 +25,11 @@ TRACE_CUSTOM_EVENT(sched_switch,
         * that the custom event is using.
         */
        TP_PROTO(bool preempt,
-                unsigned int prev_state,
                 struct task_struct *prev,
-                struct task_struct *next),
+                struct task_struct *next,
+                unsigned int prev_state),
 
-       TP_ARGS(preempt, prev_state, prev, next),
+       TP_ARGS(preempt, prev, next, prev_state),
 
        /*
         * The next fields are where the customization happens.