}
static struct perf_cpu_map *all_cpu_map;
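+/* number of filter entries; set at load time and reused when reading counts */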
+static __u32 filter_entry_cnt;
static int bperf_reload_leader_program(struct evsel *evsel, int attr_map_fd,
struct perf_event_attr_map_entry *entry)
return err;
}
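+/*
+ * Attach the follower prog(s): with inherit enabled for PID/TGID
+ * filtering, attach them all so that the task_newtask and
+ * sched_process_exit tracepoints can track the task tree; otherwise
+ * only the fexit prog is needed.
+ */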
+static int bperf_attach_follower_program(struct bperf_follower_bpf *skel,
+ enum bperf_filter_type filter_type,
+ bool inherit)
+{
+ struct bpf_link *link;
+ int err = 0;
+
+ if ((filter_type == BPERF_FILTER_PID ||
+ filter_type == BPERF_FILTER_TGID) && inherit) {
+ /* attach all follower bpf progs to enable event inheritance */
+ err = bperf_follower_bpf__attach(skel);
+ } else {
+ link = bpf_program__attach(skel->progs.fexit_XXX);
+ if (IS_ERR(link))
+ err = PTR_ERR(link);
+ }
+
+ return err;
+}
+
static int bperf__load(struct evsel *evsel, struct target *target)
{
struct perf_event_attr_map_entry entry = {0xffffffff, 0xffffffff};
int attr_map_fd, diff_map_fd = -1, err;
enum bperf_filter_type filter_type;
- __u32 filter_entry_cnt, i;
+ __u32 i;
if (bperf_check_target(evsel, target, &filter_type, &filter_entry_cnt))
return -1;
/* set up reading map */
bpf_map__set_max_entries(evsel->follower_skel->maps.accum_readings,
filter_entry_cnt);
- /* set up follower filter based on target */
- bpf_map__set_max_entries(evsel->follower_skel->maps.filter,
- filter_entry_cnt);
err = bperf_follower_bpf__load(evsel->follower_skel);
if (err) {
pr_err("Failed to load follower skeleton\n");
for (i = 0; i < filter_entry_cnt; i++) {
int filter_map_fd;
__u32 key;
+ struct bperf_filter_value fval = { i, 0 };
if (filter_type == BPERF_FILTER_PID ||
filter_type == BPERF_FILTER_TGID)
key = perf_thread_map__pid(evsel->core.threads, i);
else if (filter_type == BPERF_FILTER_CPU)
key = perf_cpu_map__cpu(all_cpu_map, i).cpu;
else
break;
filter_map_fd = bpf_map__fd(evsel->follower_skel->maps.filter);
- bpf_map_update_elem(filter_map_fd, &key, &i, BPF_ANY);
+ bpf_map_update_elem(filter_map_fd, &key, &fval, BPF_ANY);
}
evsel->follower_skel->bss->type = filter_type;
+ evsel->follower_skel->bss->inherit = target->inherit;
- err = bperf_follower_bpf__attach(evsel->follower_skel);
+ err = bperf_attach_follower_program(evsel->follower_skel, filter_type,
+ target->inherit);
out:
if (err && evsel->bperf_leader_link_fd >= 0)
bperf_sync_counters(evsel);
reading_map_fd = bpf_map__fd(skel->maps.accum_readings);
- for (i = 0; i < bpf_map__max_entries(skel->maps.accum_readings); i++) {
+ for (i = 0; i < filter_entry_cnt; i++) {
struct perf_cpu entry;
__u32 cpu;
#include <bpf/bpf_tracing.h>
#include "bperf_u.h"
+#define MAX_ENTRIES 102400
+
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(key_size, sizeof(__u32));
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(key_size, sizeof(__u32));
- __uint(value_size, sizeof(__u32));
+ __uint(value_size, sizeof(struct bperf_filter_value));
+ __uint(max_entries, MAX_ENTRIES);
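+ /* filter entries come and go with task fork/exit; don't preallocate */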
+ __uint(map_flags, BPF_F_NO_PREALLOC);
} filter SEC(".maps");
enum bperf_filter_type type = 0;
int enabled = 0;
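+/* set by userspace from target->inherit at load time */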
+int inherit;
SEC("fexit/XXX")
int BPF_PROG(fexit_XXX)
{
struct bpf_perf_event_value *diff_val, *accum_val;
__u32 filter_key, zero = 0;
- __u32 *accum_key;
+ __u32 accum_key;
+ struct bperf_filter_value *fval;
if (!enabled)
return 0;
switch (type) {
case BPERF_FILTER_GLOBAL:
- accum_key = &zero;
+ accum_key = zero;
goto do_add;
case BPERF_FILTER_CPU:
filter_key = bpf_get_smp_processor_id();
break;
case BPERF_FILTER_PID:
filter_key = bpf_get_current_pid_tgid() & 0xffffffff;
break;
case BPERF_FILTER_TGID:
- filter_key = bpf_get_current_pid_tgid() >> 32;
+ /* Use pid as the filter_key to exclude new task counts
+  * when inherit is disabled. Don't worry about existing
+  * children of the TGID losing their counts: bpf_counter has
+  * already added them to the filter map via perf_thread_map
+  * before this bpf prog runs.
+  */
+ filter_key = inherit ?
+ bpf_get_current_pid_tgid() >> 32 :
+ bpf_get_current_pid_tgid() & 0xffffffff;
break;
default:
return 0;
}
- accum_key = bpf_map_lookup_elem(&filter, &filter_key);
- if (!accum_key)
+ fval = bpf_map_lookup_elem(&filter, &filter_key);
+ if (!fval)
return 0;
+ accum_key = fval->accum_key;
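+ /* on_exittask only marks the entry as exited; delete it here, after
+  * accum_key has been saved so the final delta below still gets
+  * accumulated.
+  */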
+ if (fval->exited)
+ bpf_map_delete_elem(&filter, &filter_key);
+
do_add:
diff_val = bpf_map_lookup_elem(&diff_readings, &zero);
if (!diff_val)
return 0;
- accum_val = bpf_map_lookup_elem(&accum_readings, accum_key);
+ accum_val = bpf_map_lookup_elem(&accum_readings, &accum_key);
if (!accum_val)
return 0;
return 0;
}
+/* This prog is only attached for PID or TGID filter types with inherit enabled. */
+SEC("tp_btf/task_newtask")
+int BPF_PROG(on_newtask, struct task_struct *task, __u64 clone_flags)
+{
+ __u32 parent_key, child_key;
+ struct bperf_filter_value *parent_fval;
+ struct bperf_filter_value child_fval = { 0 };
+
+ if (!enabled)
+ return 0;
+
+ switch (type) {
+ case BPERF_FILTER_PID:
+ parent_key = bpf_get_current_pid_tgid() & 0xffffffff;
+ child_key = task->pid;
+ break;
+ case BPERF_FILTER_TGID:
+ parent_key = bpf_get_current_pid_tgid() >> 32;
+ child_key = task->tgid;
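+ /* a new thread shares the leader's tgid and is already counted */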
+ if (child_key == parent_key)
+ return 0;
+ break;
+ default:
+ return 0;
+ }
+
+ /* Check if the current task is one of the target tasks to be counted */
+ parent_fval = bpf_map_lookup_elem(&filter, &parent_key);
+ if (!parent_fval)
+ return 0;
+
+ /* Start counting for the new task by adding it into the filter map.
+  * It inherits the accum_key of its parent task so that they are
+  * counted together.
+  */
+ child_fval.accum_key = parent_fval->accum_key;
+ child_fval.exited = 0;
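+ /* BPF_NOEXIST: don't clobber an entry that is already being counted */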
+ bpf_map_update_elem(&filter, &child_key, &child_fval, BPF_NOEXIST);
+
+ return 0;
+}
+
+/* This prog is only attached for PID or TGID filter types with inherit enabled. */
+SEC("tp_btf/sched_process_exit")
+int BPF_PROG(on_exittask, struct task_struct *task)
+{
+ __u32 pid;
+ struct bperf_filter_value *fval;
+
+ if (!enabled)
+ return 0;
+
+ /* Stop counting for this task: mark it as exited and let the fexit
+  * prog fold in its final delta before removing it from the filter map.
+  * For TGID type, if the pid can be found in the map, it means that
+  * this pid belongs to the leader task. Its orphaned children (if any)
+  * are reparented to init and keep their own filter entries, so the
+  * pid can be safely removed.
+  */
+ pid = task->pid;
+ fval = bpf_map_lookup_elem(&filter, &pid);
+ if (fval)
+ fval->exited = 1;
+
+ return 0;
+}
+
char LICENSE[] SEC("license") = "Dual BSD/GPL";