perf report: Add machine parallelism
author Dmitry Vyukov <dvyukov@google.com>
Thu, 13 Feb 2025 09:08:14 +0000 (10:08 +0100)
committer Namhyung Kim <namhyung@kernel.org>
Tue, 18 Feb 2025 06:00:50 +0000 (22:00 -0800)
Add calculation of the current parallelism level (the number of threads
actively running on CPUs). The parallelism level can be shown in reports on
its own, and it can also be used to calculate latency overheads.

Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
Link: https://lore.kernel.org/r/0f8c1b8eb12619029e31b3d5c0346f4616a5aeda.1739437531.git.dvyukov@google.com
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
tools/perf/builtin-report.c
tools/perf/util/addr_location.c
tools/perf/util/addr_location.h
tools/perf/util/event.c
tools/perf/util/machine.c
tools/perf/util/machine.h

index f5fbd67..0d9bd09 100644 (file)
@@ -1568,6 +1568,7 @@ repeat:
        report.tool.cgroup               = perf_event__process_cgroup;
        report.tool.exit                 = perf_event__process_exit;
        report.tool.fork                 = perf_event__process_fork;
+       report.tool.context_switch       = perf_event__process_switch;
        report.tool.lost                 = perf_event__process_lost;
        report.tool.read                 = process_read_event;
        report.tool.attr                 = process_attr;
index 51825ef..007a2f5 100644 (file)
@@ -17,6 +17,7 @@ void addr_location__init(struct addr_location *al)
        al->cpumode = 0;
        al->cpu = 0;
        al->socket = 0;
+       al->parallelism = 1;
 }
 
 /*
index d8ac042..36aaa45 100644 (file)
@@ -21,6 +21,8 @@ struct addr_location {
        u8            cpumode;
        s32           cpu;
        s32           socket;
+       /* Same as machine.parallelism but within [1, nr_cpus]. */
+       int           parallelism;
 };
 
 void addr_location__init(struct addr_location *al);
index aac96d5..2f10e31 100644 (file)
@@ -767,6 +767,9 @@ int machine__resolve(struct machine *machine, struct addr_location *al,
                        al->socket = env->cpu[al->cpu].socket_id;
        }
 
+       /* Account for possible out-of-order switch events. */
+       al->parallelism = max(1, min(machine->parallelism, machine__nr_cpus_avail(machine)));
+
        if (al->map) {
                if (symbol_conf.dso_list &&
                    (!dso || !(strlist__has_entry(symbol_conf.dso_list,
index 55d4977..d96cbfd 100644 (file)
@@ -94,6 +94,8 @@ int machine__init(struct machine *machine, const char *root_dir, pid_t pid)
        machine->comm_exec = false;
        machine->kernel_start = 0;
        machine->vmlinux_map = NULL;
+       /* There is no initial context switch in, so we start at 1. */
+       machine->parallelism = 1;
 
        machine->root_dir = strdup(root_dir);
        if (machine->root_dir == NULL)
@@ -677,8 +679,11 @@ int machine__process_aux_output_hw_id_event(struct machine *machine __maybe_unus
 int machine__process_switch_event(struct machine *machine __maybe_unused,
                                  union perf_event *event)
 {
+       bool out = event->header.misc & PERF_RECORD_MISC_SWITCH_OUT;
+
        if (dump_trace)
                perf_event__fprintf_switch(event, stdout);
+       machine->parallelism += out ? -1 : 1;
        return 0;
 }
 
@@ -1880,6 +1885,8 @@ int machine__process_exit_event(struct machine *machine, union perf_event *event
        if (dump_trace)
                perf_event__fprintf_task(event, stdout);
 
+       /* There is no context switch out before exit, so we decrement here. */
+       machine->parallelism--;
        if (thread != NULL) {
                if (symbol_conf.keep_exited_threads)
                        thread__set_exited(thread, /*exited=*/true);
index ae3e554..b56abec 100644 (file)
@@ -50,6 +50,12 @@ struct machine {
                u64       text_start;
                u64       text_end;
        } sched, lock, traceiter, trace;
+       /*
+        * The current parallelism level (number of threads that run on CPUs).
+        * This value can be less than 1, or larger than the total number
+        * of CPUs, if events are poorly ordered.
+        */
+       int               parallelism;
        pid_t             *current_tid;
        size_t            current_tid_sz;
        union { /* Tool specific area */