selftests/bpf: Add benchmark for local_storage RCU Tasks Trace usage
authorDave Marchevsky <davemarchevsky@fb.com>
Tue, 5 Jul 2022 19:00:18 +0000 (12:00 -0700)
committerDaniel Borkmann <daniel@iogearbox.net>
Thu, 7 Jul 2022 14:35:21 +0000 (16:35 +0200)
This benchmark measures grace period latency and kthread cpu usage of
RCU Tasks Trace when many processes are creating/deleting BPF
local_storage. Intent here is to quantify improvement on these metrics
after Paul's recent RCU Tasks patches [0].

Specifically, fork 15k tasks which call a bpf prog that creates/destroys
task local_storage and sleep in a loop, resulting in many
call_rcu_tasks_trace calls.

To determine grace period latency, trace time elapsed between
rcu_tasks_trace_pregp_step and rcu_tasks_trace_postgp; for cpu usage
look at rcu_task_trace_kthread's stime in /proc/PID/stat.

On my virtualized test environment (Skylake, 8 cpus) benchmark results
demonstrate significant improvement:

BEFORE Paul's patches:

  SUMMARY tasks_trace grace period latency        avg 22298.551 us stddev 1302.165 us
  SUMMARY ticks per tasks_trace grace period      avg 2.291 stddev 0.324

AFTER Paul's patches:

  SUMMARY tasks_trace grace period latency        avg 16969.197 us  stddev 2525.053 us
  SUMMARY ticks per tasks_trace grace period      avg 1.146 stddev 0.178

Note that since these patches are not in bpf-next benchmarking was done
by cherry-picking this patch onto rcu tree.

  [0] https://lore.kernel.org/rcu/20220620225402.GA3842369@paulmck-ThinkPad-P17-Gen-1/

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Paul E. McKenney <paulmck@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20220705190018.3239050-1-davemarchevsky@fb.com
tools/testing/selftests/bpf/Makefile
tools/testing/selftests/bpf/bench.c
tools/testing/selftests/bpf/bench.h
tools/testing/selftests/bpf/benchs/bench_local_storage_rcu_tasks_trace.c [new file with mode: 0644]
tools/testing/selftests/bpf/benchs/run_bench_local_storage_rcu_tasks_trace.sh [new file with mode: 0755]
tools/testing/selftests/bpf/progs/local_storage_rcu_tasks_trace_bench.c [new file with mode: 0644]

index e32a28f..dfaac97 100644 (file)
@@ -574,6 +574,7 @@ $(OUTPUT)/bench_bpf_loop.o: $(OUTPUT)/bpf_loop_bench.skel.h
 $(OUTPUT)/bench_strncmp.o: $(OUTPUT)/strncmp_bench.skel.h
 $(OUTPUT)/bench_bpf_hashmap_full_update.o: $(OUTPUT)/bpf_hashmap_full_update_bench.skel.h
 $(OUTPUT)/bench_local_storage.o: $(OUTPUT)/local_storage_bench.skel.h
+$(OUTPUT)/bench_local_storage_rcu_tasks_trace.o: $(OUTPUT)/local_storage_rcu_tasks_trace_bench.skel.h
 $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ)
 $(OUTPUT)/bench: LDLIBS += -lm
 $(OUTPUT)/bench: $(OUTPUT)/bench.o \
@@ -587,7 +588,8 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \
                 $(OUTPUT)/bench_bpf_loop.o \
                 $(OUTPUT)/bench_strncmp.o \
                 $(OUTPUT)/bench_bpf_hashmap_full_update.o \
-                $(OUTPUT)/bench_local_storage.o
+                $(OUTPUT)/bench_local_storage.o \
+                $(OUTPUT)/bench_local_storage_rcu_tasks_trace.o
        $(call msg,BINARY,,$@)
        $(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@
 
index 1e7b5d4..c1f20a1 100644 (file)
@@ -79,6 +79,43 @@ void hits_drops_report_progress(int iter, struct bench_res *res, long delta_ns)
               hits_per_sec, hits_per_prod, drops_per_sec, hits_per_sec + drops_per_sec);
 }
 
+void
+grace_period_latency_basic_stats(struct bench_res res[], int res_cnt, struct basic_stats *gp_stat)
+{
+       int i;
+
+       memset(gp_stat, 0, sizeof(struct basic_stats));
+
+       for (i = 0; i < res_cnt; i++)
+               gp_stat->mean += res[i].gp_ns / 1000.0 / (double)res[i].gp_ct / (0.0 + res_cnt);
+
+#define IT_MEAN_DIFF (res[i].gp_ns / 1000.0 / (double)res[i].gp_ct - gp_stat->mean)
+       if (res_cnt > 1) {
+               for (i = 0; i < res_cnt; i++)
+                       gp_stat->stddev += (IT_MEAN_DIFF * IT_MEAN_DIFF) / (res_cnt - 1.0);
+       }
+       gp_stat->stddev = sqrt(gp_stat->stddev);
+#undef IT_MEAN_DIFF
+}
+
+void
+grace_period_ticks_basic_stats(struct bench_res res[], int res_cnt, struct basic_stats *gp_stat)
+{
+       int i;
+
+       memset(gp_stat, 0, sizeof(struct basic_stats));
+       for (i = 0; i < res_cnt; i++)
+               gp_stat->mean += res[i].stime / (double)res[i].gp_ct / (0.0 + res_cnt);
+
+#define IT_MEAN_DIFF (res[i].stime / (double)res[i].gp_ct - gp_stat->mean)
+       if (res_cnt > 1) {
+               for (i = 0; i < res_cnt; i++)
+                       gp_stat->stddev += (IT_MEAN_DIFF * IT_MEAN_DIFF) / (res_cnt - 1.0);
+       }
+       gp_stat->stddev = sqrt(gp_stat->stddev);
+#undef IT_MEAN_DIFF
+}
+
 void hits_drops_report_final(struct bench_res res[], int res_cnt)
 {
        int i;
@@ -236,6 +273,7 @@ extern struct argp bench_ringbufs_argp;
 extern struct argp bench_bloom_map_argp;
 extern struct argp bench_bpf_loop_argp;
 extern struct argp bench_local_storage_argp;
+extern struct argp bench_local_storage_rcu_tasks_trace_argp;
 extern struct argp bench_strncmp_argp;
 
 static const struct argp_child bench_parsers[] = {
@@ -244,6 +282,8 @@ static const struct argp_child bench_parsers[] = {
        { &bench_bpf_loop_argp, 0, "bpf_loop helper benchmark", 0 },
        { &bench_local_storage_argp, 0, "local_storage benchmark", 0 },
        { &bench_strncmp_argp, 0, "bpf_strncmp helper benchmark", 0 },
+       { &bench_local_storage_rcu_tasks_trace_argp, 0,
+               "local_storage RCU Tasks Trace slowdown benchmark", 0 },
        {},
 };
 
@@ -449,6 +489,7 @@ extern const struct bench bench_bpf_hashmap_full_update;
 extern const struct bench bench_local_storage_cache_seq_get;
 extern const struct bench bench_local_storage_cache_interleaved_get;
 extern const struct bench bench_local_storage_cache_hashmap_control;
+extern const struct bench bench_local_storage_tasks_trace;
 
 static const struct bench *benchs[] = {
        &bench_count_global,
@@ -487,6 +528,7 @@ static const struct bench *benchs[] = {
        &bench_local_storage_cache_seq_get,
        &bench_local_storage_cache_interleaved_get,
        &bench_local_storage_cache_hashmap_control,
+       &bench_local_storage_tasks_trace,
 };
 
 static void setup_benchmark()
index 4b15286..d748255 100644 (file)
@@ -30,11 +30,19 @@ struct env {
        struct cpu_set cons_cpus;
 };
 
+struct basic_stats {
+       double mean;
+       double stddev;
+};
+
 struct bench_res {
        long hits;
        long drops;
        long false_hits;
        long important_hits;
+       unsigned long gp_ns;
+       unsigned long gp_ct;
+       unsigned int stime;
 };
 
 struct bench {
@@ -65,6 +73,10 @@ void ops_report_final(struct bench_res res[], int res_cnt);
 void local_storage_report_progress(int iter, struct bench_res *res,
                                   long delta_ns);
 void local_storage_report_final(struct bench_res res[], int res_cnt);
+void grace_period_latency_basic_stats(struct bench_res res[], int res_cnt,
+                                     struct basic_stats *gp_stat);
+void grace_period_ticks_basic_stats(struct bench_res res[], int res_cnt,
+                                   struct basic_stats *gp_stat);
 
 static inline __u64 get_time_ns(void)
 {
diff --git a/tools/testing/selftests/bpf/benchs/bench_local_storage_rcu_tasks_trace.c b/tools/testing/selftests/bpf/benchs/bench_local_storage_rcu_tasks_trace.c
new file mode 100644 (file)
index 0000000..43f109d
--- /dev/null
@@ -0,0 +1,281 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+
+#include <argp.h>
+
+#include <sys/prctl.h>
+#include "local_storage_rcu_tasks_trace_bench.skel.h"
+#include "bench.h"
+
+#include <signal.h>
+
+static struct {
+       __u32 nr_procs;
+       __u32 kthread_pid;
+       bool quiet;
+} args = {
+       .nr_procs = 1000,
+       .kthread_pid = 0,
+       .quiet = false,
+};
+
+enum {
+       ARG_NR_PROCS = 7000,
+       ARG_KTHREAD_PID = 7001,
+       ARG_QUIET = 7002,
+};
+
+static const struct argp_option opts[] = {
+       { "nr_procs", ARG_NR_PROCS, "NR_PROCS", 0,
+               "Set number of user processes to spin up"},
+       { "kthread_pid", ARG_KTHREAD_PID, "PID", 0,
+               "Pid of rcu_tasks_trace kthread for ticks tracking"},
+       { "quiet", ARG_QUIET, "{0,1}", 0,
+               "If true, don't report progress"},
+       {},
+};
+
+static error_t parse_arg(int key, char *arg, struct argp_state *state)
+{
+       long ret;
+
+       switch (key) {
+       case ARG_NR_PROCS:
+               ret = strtol(arg, NULL, 10);
+               if (ret < 1 || ret > UINT_MAX) {
+                       fprintf(stderr, "invalid nr_procs\n");
+                       argp_usage(state);
+               }
+               args.nr_procs = ret;
+               break;
+       case ARG_KTHREAD_PID:
+               ret = strtol(arg, NULL, 10);
+               if (ret < 1) {
+                       fprintf(stderr, "invalid kthread_pid\n");
+                       argp_usage(state);
+               }
+               args.kthread_pid = ret;
+               break;
+       case ARG_QUIET:
+               ret = strtol(arg, NULL, 10);
+               if (ret < 0 || ret > 1) {
+                       fprintf(stderr, "invalid quiet %ld\n", ret);
+                       argp_usage(state);
+               }
+               args.quiet = ret;
+               break;
+break;
+       default:
+               return ARGP_ERR_UNKNOWN;
+       }
+
+       return 0;
+}
+
+const struct argp bench_local_storage_rcu_tasks_trace_argp = {
+       .options = opts,
+       .parser = parse_arg,
+};
+
+#define MAX_SLEEP_PROCS 150000
+
+static void validate(void)
+{
+       if (env.producer_cnt != 1) {
+               fprintf(stderr, "benchmark doesn't support multi-producer!\n");
+               exit(1);
+       }
+       if (env.consumer_cnt != 1) {
+               fprintf(stderr, "benchmark doesn't support multi-consumer!\n");
+               exit(1);
+       }
+
+       if (args.nr_procs > MAX_SLEEP_PROCS) {
+               fprintf(stderr, "benchmark supports up to %u sleeper procs!\n",
+                       MAX_SLEEP_PROCS);
+               exit(1);
+       }
+}
+
+static long kthread_pid_ticks(void)
+{
+       char procfs_path[100];
+       long stime;
+       FILE *f;
+
+       if (!args.kthread_pid)
+               return -1;
+
+       sprintf(procfs_path, "/proc/%u/stat", args.kthread_pid);
+       f = fopen(procfs_path, "r");
+       if (!f) {
+               fprintf(stderr, "couldn't open %s, exiting\n", procfs_path);
+               goto err_out;
+       }
+       if (fscanf(f, "%*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %ld", &stime) != 1) {
+               fprintf(stderr, "fscanf of %s failed, exiting\n", procfs_path);
+               goto err_out;
+       }
+       fclose(f);
+       return stime;
+
+err_out:
+       if (f)
+               fclose(f);
+       exit(1);
+       return 0;
+}
+
+static struct {
+       struct local_storage_rcu_tasks_trace_bench *skel;
+       long prev_kthread_stime;
+} ctx;
+
+static void sleep_and_loop(void)
+{
+       while (true) {
+               sleep(rand() % 4);
+               syscall(__NR_getpgid);
+       }
+}
+
+static void local_storage_tasks_trace_setup(void)
+{
+       int i, err, forkret, runner_pid;
+
+       runner_pid = getpid();
+
+       for (i = 0; i < args.nr_procs; i++) {
+               forkret = fork();
+               if (forkret < 0) {
+                       fprintf(stderr, "Error forking sleeper proc %u of %u, exiting\n", i,
+                               args.nr_procs);
+                       goto err_out;
+               }
+
+               if (!forkret) {
+                       err = prctl(PR_SET_PDEATHSIG, SIGKILL);
+                       if (err < 0) {
+                               fprintf(stderr, "prctl failed with err %d, exiting\n", errno);
+                               goto err_out;
+                       }
+
+                       if (getppid() != runner_pid) {
+                               fprintf(stderr, "Runner died while spinning up procs, exiting\n");
+                               goto err_out;
+                       }
+                       sleep_and_loop();
+               }
+       }
+       printf("Spun up %u procs (our pid %d)\n", args.nr_procs, runner_pid);
+
+       setup_libbpf();
+
+       ctx.skel = local_storage_rcu_tasks_trace_bench__open_and_load();
+       if (!ctx.skel) {
+               fprintf(stderr, "Error doing open_and_load, exiting\n");
+               goto err_out;
+       }
+
+       ctx.prev_kthread_stime = kthread_pid_ticks();
+
+       if (!bpf_program__attach(ctx.skel->progs.get_local)) {
+               fprintf(stderr, "Error attaching bpf program\n");
+               goto err_out;
+       }
+
+       if (!bpf_program__attach(ctx.skel->progs.pregp_step)) {
+               fprintf(stderr, "Error attaching bpf program\n");
+               goto err_out;
+       }
+
+       if (!bpf_program__attach(ctx.skel->progs.postgp)) {
+               fprintf(stderr, "Error attaching bpf program\n");
+               goto err_out;
+       }
+
+       return;
+err_out:
+       exit(1);
+}
+
+static void measure(struct bench_res *res)
+{
+       long ticks;
+
+       res->gp_ct = atomic_swap(&ctx.skel->bss->gp_hits, 0);
+       res->gp_ns = atomic_swap(&ctx.skel->bss->gp_times, 0);
+       ticks = kthread_pid_ticks();
+       res->stime = ticks - ctx.prev_kthread_stime;
+       ctx.prev_kthread_stime = ticks;
+}
+
+static void *consumer(void *input)
+{
+       return NULL;
+}
+
+static void *producer(void *input)
+{
+       while (true)
+               syscall(__NR_getpgid);
+       return NULL;
+}
+
+static void report_progress(int iter, struct bench_res *res, long delta_ns)
+{
+       if (ctx.skel->bss->unexpected) {
+               fprintf(stderr, "Error: Unexpected order of bpf prog calls (postgp after pregp).");
+               fprintf(stderr, "Data can't be trusted, exiting\n");
+               exit(1);
+       }
+
+       if (args.quiet)
+               return;
+
+       printf("Iter %d\t avg tasks_trace grace period latency\t%lf ns\n",
+              iter, res->gp_ns / (double)res->gp_ct);
+       printf("Iter %d\t avg ticks per tasks_trace grace period\t%lf\n",
+              iter, res->stime / (double)res->gp_ct);
+}
+
+static void report_final(struct bench_res res[], int res_cnt)
+{
+       struct basic_stats gp_stat;
+
+       grace_period_latency_basic_stats(res, res_cnt, &gp_stat);
+       printf("SUMMARY tasks_trace grace period latency");
+       printf("\tavg %.3lf us\tstddev %.3lf us\n", gp_stat.mean, gp_stat.stddev);
+       grace_period_ticks_basic_stats(res, res_cnt, &gp_stat);
+       printf("SUMMARY ticks per tasks_trace grace period");
+       printf("\tavg %.3lf\tstddev %.3lf\n", gp_stat.mean, gp_stat.stddev);
+}
+
+/* local-storage-tasks-trace: Benchmark performance of BPF local_storage's use
+ * of RCU Tasks-Trace.
+ *
+ * Stress RCU Tasks Trace by forking many tasks, all of which do no work aside
+ * from sleep() loop, and creating/destroying BPF task-local storage on wakeup.
+ * The number of forked tasks is configurable.
+ *
+ * exercising code paths which call call_rcu_tasks_trace while there are many
+ * thousands of tasks on the system should result in RCU Tasks-Trace having to
+ * do a noticeable amount of work.
+ *
+ * This should be observable by measuring rcu_tasks_trace_kthread CPU usage
+ * after the grace period has ended, or by measuring grace period latency.
+ *
+ * This benchmark uses both approaches, attaching to rcu_tasks_trace_pregp_step
+ * and rcu_tasks_trace_postgp functions to measure grace period latency and
+ * using /proc/PID/stat to measure rcu_tasks_trace_kthread kernel ticks
+ */
+const struct bench bench_local_storage_tasks_trace = {
+       .name = "local-storage-tasks-trace",
+       .validate = validate,
+       .setup = local_storage_tasks_trace_setup,
+       .producer_thread = producer,
+       .consumer_thread = consumer,
+       .measure = measure,
+       .report_progress = report_progress,
+       .report_final = report_final,
+};
diff --git a/tools/testing/selftests/bpf/benchs/run_bench_local_storage_rcu_tasks_trace.sh b/tools/testing/selftests/bpf/benchs/run_bench_local_storage_rcu_tasks_trace.sh
new file mode 100755 (executable)
index 0000000..5dac1f0
--- /dev/null
@@ -0,0 +1,11 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+kthread_pid=`pgrep rcu_tasks_trace_kthread`
+
+if [ -z $kthread_pid ]; then
+       echo "error: Couldn't find rcu_tasks_trace_kthread"
+       exit 1
+fi
+
+./bench --nr_procs 15000 --kthread_pid $kthread_pid -d 600 --quiet 1 local-storage-tasks-trace
diff --git a/tools/testing/selftests/bpf/progs/local_storage_rcu_tasks_trace_bench.c b/tools/testing/selftests/bpf/progs/local_storage_rcu_tasks_trace_bench.c
new file mode 100644 (file)
index 0000000..03bf69f
--- /dev/null
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+
+struct {
+       __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+       __uint(map_flags, BPF_F_NO_PREALLOC);
+       __type(key, int);
+       __type(value, int);
+} task_storage SEC(".maps");
+
+long hits;
+long gp_hits;
+long gp_times;
+long current_gp_start;
+long unexpected;
+bool postgp_seen;
+
+SEC("fentry/" SYS_PREFIX "sys_getpgid")
+int get_local(void *ctx)
+{
+       struct task_struct *task;
+       int idx;
+       int *s;
+
+       idx = 0;
+       task = bpf_get_current_task_btf();
+       s = bpf_task_storage_get(&task_storage, task, &idx,
+                                BPF_LOCAL_STORAGE_GET_F_CREATE);
+       if (!s)
+               return 0;
+
+       *s = 3;
+       bpf_task_storage_delete(&task_storage, task);
+       __sync_add_and_fetch(&hits, 1);
+       return 0;
+}
+
+SEC("fentry/rcu_tasks_trace_pregp_step")
+int pregp_step(struct pt_regs *ctx)
+{
+       current_gp_start = bpf_ktime_get_ns();
+       return 0;
+}
+
+SEC("fentry/rcu_tasks_trace_postgp")
+int postgp(struct pt_regs *ctx)
+{
+       if (!current_gp_start && postgp_seen) {
+               /* Will only happen if prog tracing rcu_tasks_trace_pregp_step doesn't
+                * execute before this prog
+                */
+               __sync_add_and_fetch(&unexpected, 1);
+               return 0;
+       }
+
+       __sync_add_and_fetch(&gp_times, bpf_ktime_get_ns() - current_gp_start);
+       __sync_add_and_fetch(&gp_hits, 1);
+       current_gp_start = 0;
+       postgp_seen = true;
+       return 0;
+}
+
+char _license[] SEC("license") = "GPL";