// SPDX-License-Identifier: GPL-2.0
/*
 * Memory bandwidth monitoring and allocation library
 *
 * Copyright (C) 2018 Intel Corporation
 *
 * Authors:
 *    Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>,
 *    Fenghua Yu <fenghua.yu@intel.com>
 */
/* Names used to locate the iMC PMUs and their CAS count event files */
#define UNCORE_IMC		"uncore_imc"
#define READ_FILE_NAME		"events/cas_count_read"
#define WRITE_FILE_NAME		"events/cas_count_write"
#define DYN_PMU_PATH		"/sys/bus/event_source/devices"

/*
 * Multiplier applied to raw iMC counter values. Numerically this is exactly
 * 64 / 2^20 (= 1 / 16384).
 * NOTE(review): presumably converts 64-byte CAS transfers to MiB - confirm.
 */
#define SCALE			0.00006103515625

/*
 * printf() templates for the "mbm_local_bytes" file. Arguments, in order:
 * a root path, then the control group and/or monitor group name where the
 * template has a slot for it, then the resource id (printed as %02d).
 */
#define CON_MON_MBM_LOCAL_BYTES_PATH				\
	"%s/%s/mon_groups/%s/mon_data/mon_L3_%02d/mbm_local_bytes"

#define CON_MBM_LOCAL_BYTES_PATH				\
	"%s/%s/mon_data/mon_L3_%02d/mbm_local_bytes"

#define MON_MBM_LOCAL_BYTES_PATH				\
	"%s/mon_groups/%s/mon_data/mon_L3_%02d/mbm_local_bytes"

#define MBM_LOCAL_BYTES_PATH					\
	"%s/mon_data/mon_L3_%02d/mbm_local_bytes"

/* Same four layouts as above, for the "llc_occupancy" file */
#define CON_MON_LCC_OCCUP_PATH					\
	"%s/%s/mon_groups/%s/mon_data/mon_L3_%02d/llc_occupancy"

#define CON_LCC_OCCUP_PATH					\
	"%s/%s/mon_data/mon_L3_%02d/llc_occupancy"

#define MON_LCC_OCCUP_PATH					\
	"%s/mon_groups/%s/mon_data/mon_L3_%02d/llc_occupancy"

#define LCC_OCCUP_PATH						\
	"%s/mon_data/mon_L3_%02d/llc_occupancy"
46 struct membw_read_format {
47 __u64 value; /* The value of the event */
48 __u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
49 __u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
50 __u64 id; /* if PERF_FORMAT_ID */
53 struct imc_counter_config {
57 struct perf_event_attr pe;
58 struct membw_read_format return_value;
62 static char mbm_total_path[1024];
64 static struct imc_counter_config imc_counters_config[MAX_IMCS][2];
66 void membw_initialize_perf_event_attr(int i, int j)
68 memset(&imc_counters_config[i][j].pe, 0,
69 sizeof(struct perf_event_attr));
70 imc_counters_config[i][j].pe.type = imc_counters_config[i][j].type;
71 imc_counters_config[i][j].pe.size = sizeof(struct perf_event_attr);
72 imc_counters_config[i][j].pe.disabled = 1;
73 imc_counters_config[i][j].pe.inherit = 1;
74 imc_counters_config[i][j].pe.exclude_guest = 0;
75 imc_counters_config[i][j].pe.config =
76 imc_counters_config[i][j].umask << 8 |
77 imc_counters_config[i][j].event;
78 imc_counters_config[i][j].pe.sample_type = PERF_SAMPLE_IDENTIFIER;
79 imc_counters_config[i][j].pe.read_format =
80 PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
83 void membw_ioctl_perf_event_ioc_reset_enable(int i, int j)
85 ioctl(imc_counters_config[i][j].fd, PERF_EVENT_IOC_RESET, 0);
86 ioctl(imc_counters_config[i][j].fd, PERF_EVENT_IOC_ENABLE, 0);
89 void membw_ioctl_perf_event_ioc_disable(int i, int j)
91 ioctl(imc_counters_config[i][j].fd, PERF_EVENT_IOC_DISABLE, 0);
95 * get_event_and_umask: Parse config into event and umask
96 * @cas_count_cfg: Config
98 * @op: Operation (read/write)
100 void get_event_and_umask(char *cas_count_cfg, int count, bool op)
102 char *token[MAX_TOKENS];
105 strcat(cas_count_cfg, ",");
106 token[0] = strtok(cas_count_cfg, "=,");
108 for (i = 1; i < MAX_TOKENS; i++)
109 token[i] = strtok(NULL, "=,");
111 for (i = 0; i < MAX_TOKENS; i++) {
114 if (strcmp(token[i], "event") == 0) {
116 imc_counters_config[count][READ].event =
117 strtol(token[i + 1], NULL, 16);
119 imc_counters_config[count][WRITE].event =
120 strtol(token[i + 1], NULL, 16);
122 if (strcmp(token[i], "umask") == 0) {
124 imc_counters_config[count][READ].umask =
125 strtol(token[i + 1], NULL, 16);
127 imc_counters_config[count][WRITE].umask =
128 strtol(token[i + 1], NULL, 16);
133 static int open_perf_event(int i, int cpu_no, int j)
135 imc_counters_config[i][j].fd =
136 perf_event_open(&imc_counters_config[i][j].pe, -1, cpu_no, -1,
137 PERF_FLAG_FD_CLOEXEC);
139 if (imc_counters_config[i][j].fd == -1) {
140 fprintf(stderr, "Error opening leader %llx\n",
141 imc_counters_config[i][j].pe.config);
149 /* Get type and config (read and write) of an iMC counter */
150 static int read_from_imc_dir(char *imc_dir, int count)
152 char cas_count_cfg[1024], imc_counter_cfg[1024], imc_counter_type[1024];
155 /* Get type of iMC counter */
156 sprintf(imc_counter_type, "%s%s", imc_dir, "type");
157 fp = fopen(imc_counter_type, "r");
159 perror("Failed to open imc counter type file");
163 if (fscanf(fp, "%u", &imc_counters_config[count][READ].type) <= 0) {
164 perror("Could not get imc type");
171 imc_counters_config[count][WRITE].type =
172 imc_counters_config[count][READ].type;
174 /* Get read config */
175 sprintf(imc_counter_cfg, "%s%s", imc_dir, READ_FILE_NAME);
176 fp = fopen(imc_counter_cfg, "r");
178 perror("Failed to open imc config file");
182 if (fscanf(fp, "%s", cas_count_cfg) <= 0) {
183 perror("Could not get imc cas count read");
190 get_event_and_umask(cas_count_cfg, count, READ);
192 /* Get write config */
193 sprintf(imc_counter_cfg, "%s%s", imc_dir, WRITE_FILE_NAME);
194 fp = fopen(imc_counter_cfg, "r");
196 perror("Failed to open imc config file");
200 if (fscanf(fp, "%s", cas_count_cfg) <= 0) {
201 perror("Could not get imc cas count write");
208 get_event_and_umask(cas_count_cfg, count, WRITE);
214 * A system can have 'n' number of iMC (Integrated Memory Controller)
215 * counters, get that 'n'. For each iMC counter get it's type and config.
216 * Also, each counter has two configs, one for read and the other for write.
217 * A config again has two parts, event and umask.
218 * Enumerate all these details into an array of structures.
220 * Return: >= 0 on success. < 0 on failure.
222 static int num_of_imcs(void)
224 unsigned int count = 0;
230 dp = opendir(DYN_PMU_PATH);
232 while ((ep = readdir(dp))) {
233 if (strstr(ep->d_name, UNCORE_IMC)) {
234 sprintf(imc_dir, "%s/%s/", DYN_PMU_PATH,
236 ret = read_from_imc_dir(imc_dir, count);
247 perror("Unable find iMC counters!\n");
252 perror("Unable to open PMU directory!\n");
260 static int initialize_mem_bw_imc(void)
264 imcs = num_of_imcs();
268 /* Initialize perf_event_attr structures for all iMC's */
269 for (imc = 0; imc < imcs; imc++) {
270 for (j = 0; j < 2; j++)
271 membw_initialize_perf_event_attr(imc, j);
278 * get_mem_bw_imc: Memory band width as reported by iMC counters
279 * @cpu_no: CPU number that the benchmark PID is binded to
280 * @bw_report: Bandwidth report type (reads, writes)
282 * Memory B/W utilized by a process on a socket can be calculated using
283 * iMC counters. Perf events are used to read these counters.
285 * Return: >= 0 on success. < 0 on failure.
287 static float get_mem_bw_imc(int cpu_no, char *bw_report)
289 float reads, writes, of_mul_read, of_mul_write;
292 /* Start all iMC counters to log values (both read and write) */
293 reads = 0, writes = 0, of_mul_read = 1, of_mul_write = 1;
294 for (imc = 0; imc < imcs; imc++) {
295 for (j = 0; j < 2; j++) {
296 ret = open_perf_event(imc, cpu_no, j);
300 for (j = 0; j < 2; j++)
301 membw_ioctl_perf_event_ioc_reset_enable(imc, j);
306 /* Stop counters after a second to get results (both read and write) */
307 for (imc = 0; imc < imcs; imc++) {
308 for (j = 0; j < 2; j++)
309 membw_ioctl_perf_event_ioc_disable(imc, j);
313 * Get results which are stored in struct type imc_counter_config
314 * Take over flow into consideration before calculating total b/w
316 for (imc = 0; imc < imcs; imc++) {
317 struct imc_counter_config *r =
318 &imc_counters_config[imc][READ];
319 struct imc_counter_config *w =
320 &imc_counters_config[imc][WRITE];
322 if (read(r->fd, &r->return_value,
323 sizeof(struct membw_read_format)) == -1) {
324 perror("Couldn't get read b/w through iMC");
329 if (read(w->fd, &w->return_value,
330 sizeof(struct membw_read_format)) == -1) {
331 perror("Couldn't get write bw through iMC");
336 __u64 r_time_enabled = r->return_value.time_enabled;
337 __u64 r_time_running = r->return_value.time_running;
339 if (r_time_enabled != r_time_running)
340 of_mul_read = (float)r_time_enabled /
341 (float)r_time_running;
343 __u64 w_time_enabled = w->return_value.time_enabled;
344 __u64 w_time_running = w->return_value.time_running;
346 if (w_time_enabled != w_time_running)
347 of_mul_write = (float)w_time_enabled /
348 (float)w_time_running;
349 reads += r->return_value.value * of_mul_read * SCALE;
350 writes += w->return_value.value * of_mul_write * SCALE;
353 for (imc = 0; imc < imcs; imc++) {
354 close(imc_counters_config[imc][READ].fd);
355 close(imc_counters_config[imc][WRITE].fd);
358 if (strcmp(bw_report, "reads") == 0)
361 if (strcmp(bw_report, "writes") == 0)
364 return (reads + writes);
367 void set_mbm_path(const char *ctrlgrp, const char *mongrp, int resource_id)
369 if (ctrlgrp && mongrp)
370 sprintf(mbm_total_path, CON_MON_MBM_LOCAL_BYTES_PATH,
371 RESCTRL_PATH, ctrlgrp, mongrp, resource_id);
372 else if (!ctrlgrp && mongrp)
373 sprintf(mbm_total_path, MON_MBM_LOCAL_BYTES_PATH, RESCTRL_PATH,
374 mongrp, resource_id);
375 else if (ctrlgrp && !mongrp)
376 sprintf(mbm_total_path, CON_MBM_LOCAL_BYTES_PATH, RESCTRL_PATH,
377 ctrlgrp, resource_id);
378 else if (!ctrlgrp && !mongrp)
379 sprintf(mbm_total_path, MBM_LOCAL_BYTES_PATH, RESCTRL_PATH,
384 * initialize_mem_bw_resctrl: Appropriately populate "mbm_total_path"
385 * @ctrlgrp: Name of the control monitor group (con_mon grp)
386 * @mongrp: Name of the monitor group (mon grp)
387 * @cpu_no: CPU number that the benchmark PID is binded to
388 * @resctrl_val: Resctrl feature (Eg: mbm, mba.. etc)
390 static void initialize_mem_bw_resctrl(const char *ctrlgrp, const char *mongrp,
391 int cpu_no, char *resctrl_val)
395 if (get_resource_id(cpu_no, &resource_id) < 0) {
396 perror("Could not get resource_id");
400 if (strcmp(resctrl_val, "mbm") == 0)
401 set_mbm_path(ctrlgrp, mongrp, resource_id);
403 if ((strcmp(resctrl_val, "mba") == 0)) {
405 sprintf(mbm_total_path, CON_MBM_LOCAL_BYTES_PATH,
406 RESCTRL_PATH, ctrlgrp, resource_id);
408 sprintf(mbm_total_path, MBM_LOCAL_BYTES_PATH,
409 RESCTRL_PATH, resource_id);
414 * Get MBM Local bytes as reported by resctrl FS
416 * 1. If con_mon grp and mon grp are given, then read from con_mon grp's mon grp
417 * 2. If only con_mon grp is given, then read from con_mon grp
418 * 3. If both are not given, then read from root con_mon grp
420 * 1. If con_mon grp is given, then read from it
421 * 2. If con_mon grp is not given, then read from root con_mon grp
423 static unsigned long get_mem_bw_resctrl(void)
425 unsigned long mbm_total = 0;
428 fp = fopen(mbm_total_path, "r");
430 perror("Failed to open total bw file");
434 if (fscanf(fp, "%lu", &mbm_total) <= 0) {
435 perror("Could not get mbm local bytes");
447 void ctrlc_handler(int signum, siginfo_t *info, void *ptr)
449 kill(bm_pid, SIGKILL);
452 printf("Ending\n\n");
/*
 * print_results_bw:	the memory bandwidth results are stored in a file
 * @filename:		file that stores the results
 * @bm_pid:		child pid that runs benchmark
 * @bw_imc:		perf imc counter value
 * @bw_resc:		memory bandwidth value
 *
 * The file names "stdio" and "stderr" are special: for both the results are
 * printed with printf(). Any other name is opened in append mode.
 *
 * Return: 0 on success. non-zero on failure.
 */
static int print_results_bw(char *filename, int bm_pid, float bw_imc,
			    unsigned long bw_resc)
{
	unsigned long diff = fabs(bw_imc - bw_resc);
	FILE *fp;
	int err;

	if (strcmp(filename, "stdio") == 0 || strcmp(filename, "stderr") == 0) {
		printf("Pid: %d \t Mem_BW_iMC: %f \t ", bm_pid, bw_imc);
		printf("Mem_BW_resc: %lu \t Difference: %lu\n", bw_resc, diff);

		return 0;
	}

	fp = fopen(filename, "a");
	if (!fp) {
		/* perror() may change errno, so take a copy first */
		err = errno;
		perror("Cannot open results file");

		return err ? err : EIO;
	}

	if (fprintf(fp, "Pid: %d \t Mem_BW_iMC: %f \t Mem_BW_resc: %lu \t Difference: %lu\n",
		    bm_pid, bw_imc, bw_resc, diff) <= 0) {
		err = errno;
		fclose(fp);
		/* Report the fprintf() failure, not whatever fclose() left */
		errno = err;
		perror("Could not log results.");

		return err ? err : EIO;
	}

	/* The stream is buffered, a write error may only show up here */
	if (fclose(fp) == EOF) {
		err = errno;
		perror("Could not log results.");

		return err ? err : EIO;
	}

	return 0;
}
495 static void set_cqm_path(const char *ctrlgrp, const char *mongrp, char sock_num)
497 if (strlen(ctrlgrp) && strlen(mongrp))
498 sprintf(llc_occup_path, CON_MON_LCC_OCCUP_PATH, RESCTRL_PATH,
499 ctrlgrp, mongrp, sock_num);
500 else if (!strlen(ctrlgrp) && strlen(mongrp))
501 sprintf(llc_occup_path, MON_LCC_OCCUP_PATH, RESCTRL_PATH,
503 else if (strlen(ctrlgrp) && !strlen(mongrp))
504 sprintf(llc_occup_path, CON_LCC_OCCUP_PATH, RESCTRL_PATH,
506 else if (!strlen(ctrlgrp) && !strlen(mongrp))
507 sprintf(llc_occup_path, LCC_OCCUP_PATH, RESCTRL_PATH, sock_num);
/*
 * initialize_llc_occu_resctrl:	Appropriately populate "llc_occup_path"
 * @ctrlgrp:			Name of the control monitor group (con_mon grp)
 * @mongrp:			Name of the monitor group (mon grp)
 * @cpu_no:			CPU number that the benchmark PID is bound to
 * @resctrl_val:		Resctrl feature (Eg: cat, cqm.. etc)
 *
 * Nothing is done for a feature other than "cqm", or when the resource id of
 * @cpu_no cannot be obtained.
 */
static void initialize_llc_occu_resctrl(const char *ctrlgrp, const char *mongrp,
					int cpu_no, char *resctrl_val)
{
	int resource_id;

	if (get_resource_id(cpu_no, &resource_id) < 0) {
		perror("# Unable to resource_id");
		return;
	}

	if (strcmp(resctrl_val, "cqm") != 0)
		return;

	set_cqm_path(ctrlgrp, mongrp, resource_id);
}
532 measure_vals(struct resctrl_val_param *param, unsigned long *bw_resc_start)
534 unsigned long bw_imc, bw_resc, bw_resc_end;
538 * Measure memory bandwidth from resctrl and from
539 * another source which is perf imc value or could
540 * be something else if perf imc event is not available.
541 * Compare the two values to validate resctrl value.
542 * It takes 1sec to measure the data.
544 bw_imc = get_mem_bw_imc(param->cpu_no, param->bw_report);
548 bw_resc_end = get_mem_bw_resctrl();
549 if (bw_resc_end <= 0)
552 bw_resc = (bw_resc_end - *bw_resc_start) / MB;
553 ret = print_results_bw(param->filename, bm_pid, bw_imc, bw_resc);
557 *bw_resc_start = bw_resc_end;
563 * resctrl_val: execute benchmark and measure memory bandwidth on
565 * @benchmark_cmd: benchmark command and its arguments
566 * @param: parameters passed to resctrl_val()
568 * Return: 0 on success. non-zero on failure.
570 int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param)
572 char *resctrl_val = param->resctrl_val;
573 unsigned long bw_resc_start = 0;
574 struct sigaction sigact;
575 int ret = 0, pipefd[2];
576 char pipe_message = 0;
579 if (strcmp(param->filename, "") == 0)
580 sprintf(param->filename, "stdio");
582 if ((strcmp(resctrl_val, "mba")) == 0 ||
583 (strcmp(resctrl_val, "mbm")) == 0) {
584 ret = validate_bw_report_request(param->bw_report);
589 ret = remount_resctrlfs(param->mum_resctrlfs);
594 * If benchmark wasn't successfully started by child, then child should
595 * kill parent, so save parent's pid
600 perror("# Unable to create pipe");
606 * Fork to start benchmark, save child's pid so that it can be killed
611 perror("# Unable to fork");
618 * Mask all signals except SIGUSR1, parent uses SIGUSR1 to
621 sigfillset(&sigact.sa_mask);
622 sigdelset(&sigact.sa_mask, SIGUSR1);
624 sigact.sa_sigaction = run_benchmark;
625 sigact.sa_flags = SA_SIGINFO;
627 /* Register for "SIGUSR1" signal from parent */
628 if (sigaction(SIGUSR1, &sigact, NULL))
629 PARENT_EXIT("Can't register child for signal");
631 /* Tell parent that child is ready */
634 if (write(pipefd[1], &pipe_message, sizeof(pipe_message)) <
635 sizeof(pipe_message)) {
636 perror("# failed signaling parent process");
642 /* Suspend child until delivery of "SIGUSR1" from parent */
643 sigsuspend(&sigact.sa_mask);
645 PARENT_EXIT("Child is done");
648 printf("# benchmark PID: %d\n", bm_pid);
651 * Register CTRL-C handler for parent, as it has to kill benchmark
654 sigact.sa_sigaction = ctrlc_handler;
655 sigemptyset(&sigact.sa_mask);
656 sigact.sa_flags = SA_SIGINFO;
657 if (sigaction(SIGINT, &sigact, NULL) ||
658 sigaction(SIGHUP, &sigact, NULL)) {
659 perror("# sigaction");
664 value.sival_ptr = benchmark_cmd;
666 /* Taskset benchmark to specified cpu */
667 ret = taskset_benchmark(bm_pid, param->cpu_no);
671 /* Write benchmark to specified control&monitoring grp in resctrl FS */
672 ret = write_bm_pid_to_resctrl(bm_pid, param->ctrlgrp, param->mongrp,
677 if ((strcmp(resctrl_val, "mbm") == 0) ||
678 (strcmp(resctrl_val, "mba") == 0)) {
679 ret = initialize_mem_bw_imc();
683 initialize_mem_bw_resctrl(param->ctrlgrp, param->mongrp,
684 param->cpu_no, resctrl_val);
685 } else if (strcmp(resctrl_val, "cqm") == 0)
686 initialize_llc_occu_resctrl(param->ctrlgrp, param->mongrp,
687 param->cpu_no, resctrl_val);
689 /* Parent waits for child to be ready. */
691 while (pipe_message != 1) {
692 if (read(pipefd[0], &pipe_message, sizeof(pipe_message)) <
693 sizeof(pipe_message)) {
694 perror("# failed reading message from child process");
701 /* Signal child to start benchmark */
702 if (sigqueue(bm_pid, SIGUSR1, value) == -1) {
703 perror("# sigqueue SIGUSR1 to child");
708 /* Give benchmark enough time to fully run */
711 /* Test runs until the callback setup() tells the test to stop. */
713 if ((strcmp(resctrl_val, "mbm") == 0) ||
714 (strcmp(resctrl_val, "mba") == 0)) {
715 ret = param->setup(1, param);
721 ret = measure_vals(param, &bw_resc_start);
724 } else if (strcmp(resctrl_val, "cqm") == 0) {
725 ret = param->setup(1, param);
731 ret = measure_cache_vals(param, bm_pid);
740 kill(bm_pid, SIGKILL);