arch/x86/events/intel/rapl.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Support Intel RAPL energy consumption counters
   4  * Copyright (C) 2013 Google, Inc., Stephane Eranian
   5  *
   6  * Intel RAPL interface is specified in the IA-32 Manual Vol3b
   7  * section 14.7.1 (September 2013)
   8  *
   9  * RAPL provides more controls than just reporting energy consumption
  10  * however here we only expose the energy consumption free running
  11  * counters (pp0, pkg, dram, gpu, psys).
  12  *
  13  * Each of those counters increments in a power unit defined by the
  14  * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
  15  * but it can vary.
  16  *
  17  * Counter to rapl events mappings:
  18  *
  19  *  pp0 counter: consumption of all physical cores (power plane 0)
  20  *        event: rapl_energy_cores
  21  *    perf code: 0x1
  22  *
  23  *  pkg counter: consumption of the whole processor package
  24  *        event: rapl_energy_pkg
  25  *    perf code: 0x2
  26  *
  27  * dram counter: consumption of the dram domain (servers only)
  28  *        event: rapl_energy_dram
  29  *    perf code: 0x3
  30  *
  31  * gpu counter: consumption of the builtin-gpu domain (client only)
  32  *        event: rapl_energy_gpu
  33  *    perf code: 0x4
  34  *
  35  *  psys counter: consumption of the builtin-psys domain (client only)
  36  *        event: rapl_energy_psys
  37  *    perf code: 0x5
  38  *
  39  * We manage those counters as free running (read-only). They may be
  40  * used simultaneously by other tools, such as turbostat.
  41  *
  42  * The events only support system-wide mode counting. There is no
  43  * sampling support because it does not make sense and is not
  44  * supported by the RAPL hardware.
  45  *
  46  * Because we want to avoid floating-point operations in the kernel,
  47  * the events are all reported in fixed point arithmetic (32.32).
  48  * Tools must adjust the counts to convert them to Watts using
  49  * the duration of the measurement. Tools may use a function such as
  50  * ldexp(raw_count, -32);
  51  */
  52
  53 #define pr_fmt(fmt) "RAPL PMU: " fmt
  54
  55 #include <linux/module.h>
  56 #include <linux/slab.h>
  57 #include <linux/perf_event.h>
  58 #include <asm/cpu_device_id.h>
  59 #include <asm/intel-family.h>
  60 #include "../perf_event.h"
  61 #include "../probe.h"
  62
  63 MODULE_LICENSE("GPL");
  64
  65 /*
  66  * RAPL energy status counters
  67  */
  68 #define RAPL_IDX_PP0_NRG_STAT   0       /* all cores */
  69 #define INTEL_RAPL_PP0          0x1     /* pseudo-encoding */
  70 #define RAPL_IDX_PKG_NRG_STAT   1       /* entire package */
  71 #define INTEL_RAPL_PKG          0x2     /* pseudo-encoding */
  72 #define RAPL_IDX_RAM_NRG_STAT   2       /* DRAM */
  73 #define INTEL_RAPL_RAM          0x3     /* pseudo-encoding */
  74 #define RAPL_IDX_PP1_NRG_STAT   3       /* gpu */
  75 #define INTEL_RAPL_PP1          0x4     /* pseudo-encoding */
  76 #define RAPL_IDX_PSYS_NRG_STAT  4       /* psys */
  77 #define INTEL_RAPL_PSYS         0x5     /* pseudo-encoding */
  78
  79 #define NR_RAPL_DOMAINS         0x5
  80
  81 enum perf_rapl_events {
  82         PERF_RAPL_PP0 = 0,              /* all cores */
  83         PERF_RAPL_PKG,                  /* entire package */
  84         PERF_RAPL_RAM,                  /* DRAM */
  85         PERF_RAPL_PP1,                  /* gpu */
  86         PERF_RAPL_PSYS,                 /* psys */
  87
  88         PERF_RAPL_MAX,
  89 };
  90
  91 static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
  92         "pp0-core",
  93         "package",
  94         "dram",
  95         "pp1-gpu",
  96         "psys",
  97 };
  98
  99 /* Clients have PP0, PKG */
 100 #define RAPL_IDX_CLN    (1<<RAPL_IDX_PP0_NRG_STAT|\
 101                          1<<RAPL_IDX_PKG_NRG_STAT|\
 102                          1<<RAPL_IDX_PP1_NRG_STAT)
 103
 104 /* Servers have PP0, PKG, RAM */
 105 #define RAPL_IDX_SRV    (1<<RAPL_IDX_PP0_NRG_STAT|\
 106                          1<<RAPL_IDX_PKG_NRG_STAT|\
 107                          1<<RAPL_IDX_RAM_NRG_STAT)
 108
 109 /* Servers have PP0, PKG, RAM, PP1 */
 110 #define RAPL_IDX_HSW    (1<<RAPL_IDX_PP0_NRG_STAT|\
 111                          1<<RAPL_IDX_PKG_NRG_STAT|\
 112                          1<<RAPL_IDX_RAM_NRG_STAT|\
 113                          1<<RAPL_IDX_PP1_NRG_STAT)
 114
 115 /* SKL clients have PP0, PKG, RAM, PP1, PSYS */
 116 #define RAPL_IDX_SKL_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\
 117                           1<<RAPL_IDX_PKG_NRG_STAT|\
 118                           1<<RAPL_IDX_RAM_NRG_STAT|\
 119                           1<<RAPL_IDX_PP1_NRG_STAT|\
 120                           1<<RAPL_IDX_PSYS_NRG_STAT)
 121
 122 /* Knights Landing has PKG, RAM */
 123 #define RAPL_IDX_KNL    (1<<RAPL_IDX_PKG_NRG_STAT|\
 124                          1<<RAPL_IDX_RAM_NRG_STAT)
 125
 126 /*
 127  * event code: LSB 8 bits, passed in attr->config
 128  * any other bit is reserved
 129  */
 130 #define RAPL_EVENT_MASK 0xFFULL
 131
 132 #define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format)           \
 133 static ssize_t __rapl_##_var##_show(struct kobject *kobj,       \
 134                                 struct kobj_attribute *attr,    \
 135                                 char *page)                     \
 136 {                                                               \
 137         BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);             \
 138         return sprintf(page, _format "\n");                     \
 139 }                                                               \
 140 static struct kobj_attribute format_attr_##_var =               \
 141         __ATTR(_name, 0444, __rapl_##_var##_show, NULL)
 142
 143 #define RAPL_CNTR_WIDTH 32
 144
 145 #define RAPL_EVENT_ATTR_STR(_name, v, str)                                      \
 146 static struct perf_pmu_events_attr event_attr_##v = {                           \
 147         .attr           = __ATTR(_name, 0444, perf_event_sysfs_show, NULL),     \
 148         .id             = 0,                                                    \
 149         .event_str      = str,                                                  \
 150 };
 151
 152 struct rapl_pmu {
 153         raw_spinlock_t          lock;
 154         int                     n_active;
 155         int                     cpu;
 156         struct list_head        active_list;
 157         struct pmu              *pmu;
 158         ktime_t                 timer_interval;
 159         struct hrtimer          hrtimer;
 160 };
 161
 162 struct rapl_pmus {
 163         struct pmu              pmu;
 164         unsigned int            maxdie;
 165         struct rapl_pmu         *pmus[];
 166 };
 167
 168 struct rapl_model {
 169         unsigned long   events;
 170         bool            apply_quirk;
 171 };
 172
 173  /* 1/2^hw_unit Joule */
 174 static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
 175 static struct rapl_pmus *rapl_pmus;
 176 static cpumask_t rapl_cpu_mask;
 177 static unsigned int rapl_cntr_mask;
 178 static u64 rapl_timer_ms;
 179
 180 static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
 181 {
 182         unsigned int dieid = topology_logical_die_id(cpu);
 183
 184         /*
 185          * The unsigned check also catches the '-1' return value for non
 186          * existent mappings in the topology map.
 187          */
 188         return dieid < rapl_pmus->maxdie ? rapl_pmus->pmus[dieid] : NULL;
 189 }
 190
 191 static inline u64 rapl_read_counter(struct perf_event *event)
 192 {
 193         u64 raw;
 194         rdmsrl(event->hw.event_base, raw);
 195         return raw;
 196 }
 197
 198 static inline u64 rapl_scale(u64 v, int cfg)
 199 {
 200         if (cfg > NR_RAPL_DOMAINS) {
 201                 pr_warn("Invalid domain %d, failed to scale data\n", cfg);
 202                 return v;
 203         }
 204         /*
 205          * scale delta to smallest unit (1/2^32)
 206          * users must then scale back: count * 1/(1e9*2^32) to get Joules
 207          * or use ldexp(count, -32).
 208          * Watts = Joules/Time delta
 209          */
 210         return v << (32 - rapl_hw_unit[cfg - 1]);
 211 }
 212
 213 static u64 rapl_event_update(struct perf_event *event)
 214 {
 215         struct hw_perf_event *hwc = &event->hw;
 216         u64 prev_raw_count, new_raw_count;
 217         s64 delta, sdelta;
 218         int shift = RAPL_CNTR_WIDTH;
 219
 220 again:
 221         prev_raw_count = local64_read(&hwc->prev_count);
 222         rdmsrl(event->hw.event_base, new_raw_count);
 223
 224         if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
 225                             new_raw_count) != prev_raw_count) {
 226                 cpu_relax();
 227                 goto again;
 228         }
 229
 230         /*
 231          * Now we have the new raw value and have updated the prev
 232          * timestamp already. We can now calculate the elapsed delta
 233          * (event-)time and add that to the generic event.
 234          *
 235          * Careful, not all hw sign-extends above the physical width
 236          * of the count.
 237          */
 238         delta = (new_raw_count << shift) - (prev_raw_count << shift);
 239         delta >>= shift;
 240
 241         sdelta = rapl_scale(delta, event->hw.config);
 242
 243         local64_add(sdelta, &event->count);
 244
 245         return new_raw_count;
 246 }
 247
 248 static void rapl_start_hrtimer(struct rapl_pmu *pmu)
 249 {
 250        hrtimer_start(&pmu->hrtimer, pmu->timer_interval,
 251                      HRTIMER_MODE_REL_PINNED);
 252 }
 253
 254 static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
 255 {
 256         struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
 257         struct perf_event *event;
 258         unsigned long flags;
 259
 260         if (!pmu->n_active)
 261                 return HRTIMER_NORESTART;
 262
 263         raw_spin_lock_irqsave(&pmu->lock, flags);
 264
 265         list_for_each_entry(event, &pmu->active_list, active_entry)
 266                 rapl_event_update(event);
 267
 268         raw_spin_unlock_irqrestore(&pmu->lock, flags);
 269
 270         hrtimer_forward_now(hrtimer, pmu->timer_interval);
 271
 272         return HRTIMER_RESTART;
 273 }
 274
 275 static void rapl_hrtimer_init(struct rapl_pmu *pmu)
 276 {
 277         struct hrtimer *hr = &pmu->hrtimer;
 278
 279         hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 280         hr->function = rapl_hrtimer_handle;
 281 }
 282
 283 static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
 284                                    struct perf_event *event)
 285 {
 286         if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
 287                 return;
 288
 289         event->hw.state = 0;
 290
 291         list_add_tail(&event->active_entry, &pmu->active_list);
 292
 293         local64_set(&event->hw.prev_count, rapl_read_counter(event));
 294
 295         pmu->n_active++;
 296         if (pmu->n_active == 1)
 297                 rapl_start_hrtimer(pmu);
 298 }
 299
 300 static void rapl_pmu_event_start(struct perf_event *event, int mode)
 301 {
 302         struct rapl_pmu *pmu = event->pmu_private;
 303         unsigned long flags;
 304
 305         raw_spin_lock_irqsave(&pmu->lock, flags);
 306         __rapl_pmu_event_start(pmu, event);
 307         raw_spin_unlock_irqrestore(&pmu->lock, flags);
 308 }
 309
 310 static void rapl_pmu_event_stop(struct perf_event *event, int mode)
 311 {
 312         struct rapl_pmu *pmu = event->pmu_private;
 313         struct hw_perf_event *hwc = &event->hw;
 314         unsigned long flags;
 315
 316         raw_spin_lock_irqsave(&pmu->lock, flags);
 317
 318         /* mark event as deactivated and stopped */
 319         if (!(hwc->state & PERF_HES_STOPPED)) {
 320                 WARN_ON_ONCE(pmu->n_active <= 0);
 321                 pmu->n_active--;
 322                 if (pmu->n_active == 0)
 323                         hrtimer_cancel(&pmu->hrtimer);
 324
 325                 list_del(&event->active_entry);
 326
 327                 WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
 328                 hwc->state |= PERF_HES_STOPPED;
 329         }
 330
 331         /* check if update of sw counter is necessary */
 332         if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
 333                 /*
 334                  * Drain the remaining delta count out of a event
 335                  * that we are disabling:
 336                  */
 337                 rapl_event_update(event);
 338                 hwc->state |= PERF_HES_UPTODATE;
 339         }
 340
 341         raw_spin_unlock_irqrestore(&pmu->lock, flags);
 342 }
 343
 344 static int rapl_pmu_event_add(struct perf_event *event, int mode)
 345 {
 346         struct rapl_pmu *pmu = event->pmu_private;
 347         struct hw_perf_event *hwc = &event->hw;
 348         unsigned long flags;
 349
 350         raw_spin_lock_irqsave(&pmu->lock, flags);
 351
 352         hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
 353
 354         if (mode & PERF_EF_START)
 355                 __rapl_pmu_event_start(pmu, event);
 356
 357         raw_spin_unlock_irqrestore(&pmu->lock, flags);
 358
 359         return 0;
 360 }
 361
 362 static void rapl_pmu_event_del(struct perf_event *event, int flags)
 363 {
 364         rapl_pmu_event_stop(event, PERF_EF_UPDATE);
 365 }
 366
 367 static int rapl_pmu_event_init(struct perf_event *event)
 368 {
 369         u64 cfg = event->attr.config & RAPL_EVENT_MASK;
 370         int bit, msr, ret = 0;
 371         struct rapl_pmu *pmu;
 372
 373         /* only look at RAPL events */
 374         if (event->attr.type != rapl_pmus->pmu.type)
 375                 return -ENOENT;
 376
 377         /* check only supported bits are set */
 378         if (event->attr.config & ~RAPL_EVENT_MASK)
 379                 return -EINVAL;
 380
 381         if (event->cpu < 0)
 382                 return -EINVAL;
 383
 384         event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
 385
 386         /*
 387          * check event is known (determines counter)
 388          */
 389         switch (cfg) {
 390         case INTEL_RAPL_PP0:
 391                 bit = RAPL_IDX_PP0_NRG_STAT;
 392                 msr = MSR_PP0_ENERGY_STATUS;
 393                 break;
 394         case INTEL_RAPL_PKG:
 395                 bit = RAPL_IDX_PKG_NRG_STAT;
 396                 msr = MSR_PKG_ENERGY_STATUS;
 397                 break;
 398         case INTEL_RAPL_RAM:
 399                 bit = RAPL_IDX_RAM_NRG_STAT;
 400                 msr = MSR_DRAM_ENERGY_STATUS;
 401                 break;
 402         case INTEL_RAPL_PP1:
 403                 bit = RAPL_IDX_PP1_NRG_STAT;
 404                 msr = MSR_PP1_ENERGY_STATUS;
 405                 break;
 406         case INTEL_RAPL_PSYS:
 407                 bit = RAPL_IDX_PSYS_NRG_STAT;
 408                 msr = MSR_PLATFORM_ENERGY_STATUS;
 409                 break;
 410         default:
 411                 return -EINVAL;
 412         }
 413         /* check event supported */
 414         if (!(rapl_cntr_mask & (1 << bit)))
 415                 return -EINVAL;
 416
 417         /* unsupported modes and filters */
 418         if (event->attr.sample_period) /* no sampling */
 419                 return -EINVAL;
 420
 421         /* must be done before validate_group */
 422         pmu = cpu_to_rapl_pmu(event->cpu);
 423         if (!pmu)
 424                 return -EINVAL;
 425         event->cpu = pmu->cpu;
 426         event->pmu_private = pmu;
 427         event->hw.event_base = msr;
 428         event->hw.config = cfg;
 429         event->hw.idx = bit;
 430
 431         return ret;
 432 }
 433
 434 static void rapl_pmu_event_read(struct perf_event *event)
 435 {
 436         rapl_event_update(event);
 437 }
 438
 439 static ssize_t rapl_get_attr_cpumask(struct device *dev,
 440                                 struct device_attribute *attr, char *buf)
 441 {
 442         return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
 443 }
 444
 445 static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
 446
 447 static struct attribute *rapl_pmu_attrs[] = {
 448         &dev_attr_cpumask.attr,
 449         NULL,
 450 };
 451
 452 static struct attribute_group rapl_pmu_attr_group = {
 453         .attrs = rapl_pmu_attrs,
 454 };
 455
 456 RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
 457 RAPL_EVENT_ATTR_STR(energy-pkg  ,   rapl_pkg, "event=0x02");
 458 RAPL_EVENT_ATTR_STR(energy-ram  ,   rapl_ram, "event=0x03");
 459 RAPL_EVENT_ATTR_STR(energy-gpu  ,   rapl_gpu, "event=0x04");
 460 RAPL_EVENT_ATTR_STR(energy-psys,   rapl_psys, "event=0x05");
 461
 462 RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
 463 RAPL_EVENT_ATTR_STR(energy-pkg.unit  ,   rapl_pkg_unit, "Joules");
 464 RAPL_EVENT_ATTR_STR(energy-ram.unit  ,   rapl_ram_unit, "Joules");
 465 RAPL_EVENT_ATTR_STR(energy-gpu.unit  ,   rapl_gpu_unit, "Joules");
 466 RAPL_EVENT_ATTR_STR(energy-psys.unit,   rapl_psys_unit, "Joules");
 467
 468 /*
 469  * we compute in 0.23 nJ increments regardless of MSR
 470  */
 471 RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
 472 RAPL_EVENT_ATTR_STR(energy-pkg.scale,     rapl_pkg_scale, "2.3283064365386962890625e-10");
 473 RAPL_EVENT_ATTR_STR(energy-ram.scale,     rapl_ram_scale, "2.3283064365386962890625e-10");
 474 RAPL_EVENT_ATTR_STR(energy-gpu.scale,     rapl_gpu_scale, "2.3283064365386962890625e-10");
 475 RAPL_EVENT_ATTR_STR(energy-psys.scale,   rapl_psys_scale, "2.3283064365386962890625e-10");
 476
 477 static struct attribute *rapl_events_srv_attr[] = {
 478         EVENT_PTR(rapl_cores),
 479         EVENT_PTR(rapl_pkg),
 480         EVENT_PTR(rapl_ram),
 481
 482         EVENT_PTR(rapl_cores_unit),
 483         EVENT_PTR(rapl_pkg_unit),
 484         EVENT_PTR(rapl_ram_unit),
 485
 486         EVENT_PTR(rapl_cores_scale),
 487         EVENT_PTR(rapl_pkg_scale),
 488         EVENT_PTR(rapl_ram_scale),
 489         NULL,
 490 };
 491
 492 static struct attribute *rapl_events_cln_attr[] = {
 493         EVENT_PTR(rapl_cores),
 494         EVENT_PTR(rapl_pkg),
 495         EVENT_PTR(rapl_gpu),
 496
 497         EVENT_PTR(rapl_cores_unit),
 498         EVENT_PTR(rapl_pkg_unit),
 499         EVENT_PTR(rapl_gpu_unit),
 500
 501         EVENT_PTR(rapl_cores_scale),
 502         EVENT_PTR(rapl_pkg_scale),
 503         EVENT_PTR(rapl_gpu_scale),
 504         NULL,
 505 };
 506
 507 static struct attribute *rapl_events_hsw_attr[] = {
 508         EVENT_PTR(rapl_cores),
 509         EVENT_PTR(rapl_pkg),
 510         EVENT_PTR(rapl_gpu),
 511         EVENT_PTR(rapl_ram),
 512
 513         EVENT_PTR(rapl_cores_unit),
 514         EVENT_PTR(rapl_pkg_unit),
 515         EVENT_PTR(rapl_gpu_unit),
 516         EVENT_PTR(rapl_ram_unit),
 517
 518         EVENT_PTR(rapl_cores_scale),
 519         EVENT_PTR(rapl_pkg_scale),
 520         EVENT_PTR(rapl_gpu_scale),
 521         EVENT_PTR(rapl_ram_scale),
 522         NULL,
 523 };
 524
 525 static struct attribute *rapl_events_skl_attr[] = {
 526         EVENT_PTR(rapl_cores),
 527         EVENT_PTR(rapl_pkg),
 528         EVENT_PTR(rapl_gpu),
 529         EVENT_PTR(rapl_ram),
 530         EVENT_PTR(rapl_psys),
 531
 532         EVENT_PTR(rapl_cores_unit),
 533         EVENT_PTR(rapl_pkg_unit),
 534         EVENT_PTR(rapl_gpu_unit),
 535         EVENT_PTR(rapl_ram_unit),
 536         EVENT_PTR(rapl_psys_unit),
 537
 538         EVENT_PTR(rapl_cores_scale),
 539         EVENT_PTR(rapl_pkg_scale),
 540         EVENT_PTR(rapl_gpu_scale),
 541         EVENT_PTR(rapl_ram_scale),
 542         EVENT_PTR(rapl_psys_scale),
 543         NULL,
 544 };
 545
 546 static struct attribute *rapl_events_knl_attr[] = {
 547         EVENT_PTR(rapl_pkg),
 548         EVENT_PTR(rapl_ram),
 549
 550         EVENT_PTR(rapl_pkg_unit),
 551         EVENT_PTR(rapl_ram_unit),
 552
 553         EVENT_PTR(rapl_pkg_scale),
 554         EVENT_PTR(rapl_ram_scale),
 555         NULL,
 556 };
 557
 558 /*
 559  * There are no default events, but we need to create
 560  * "events" group (with empty attrs) before updating
 561  * it with detected events.
 562  */
 563 static struct attribute *attrs_empty[] = {
 564         NULL,
 565 };
 566
 567 static struct attribute_group rapl_pmu_events_group = {
 568         .name = "events",
 569         .attrs = attrs_empty,
 570 };
 571
 572 DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");
 573 static struct attribute *rapl_formats_attr[] = {
 574         &format_attr_event.attr,
 575         NULL,
 576 };
 577
 578 static struct attribute_group rapl_pmu_format_group = {
 579         .name = "format",
 580         .attrs = rapl_formats_attr,
 581 };
 582
 583 static const struct attribute_group *rapl_attr_groups[] = {
 584         &rapl_pmu_attr_group,
 585         &rapl_pmu_format_group,
 586         &rapl_pmu_events_group,
 587         NULL,
 588 };
 589
 590 static struct attribute *rapl_events_cores[] = {
 591         EVENT_PTR(rapl_cores),
 592         EVENT_PTR(rapl_cores_unit),
 593         EVENT_PTR(rapl_cores_scale),
 594         NULL,
 595 };
 596
 597 static struct attribute_group rapl_events_cores_group = {
 598         .name  = "events",
 599         .attrs = rapl_events_cores,
 600 };
 601
 602 static struct attribute *rapl_events_pkg[] = {
 603         EVENT_PTR(rapl_pkg),
 604         EVENT_PTR(rapl_pkg_unit),
 605         EVENT_PTR(rapl_pkg_scale),
 606         NULL,
 607 };
 608
 609 static struct attribute_group rapl_events_pkg_group = {
 610         .name  = "events",
 611         .attrs = rapl_events_pkg,
 612 };
 613
 614 static struct attribute *rapl_events_ram[] = {
 615         EVENT_PTR(rapl_ram),
 616         EVENT_PTR(rapl_ram_unit),
 617         EVENT_PTR(rapl_ram_scale),
 618         NULL,
 619 };
 620
 621 static struct attribute_group rapl_events_ram_group = {
 622         .name  = "events",
 623         .attrs = rapl_events_ram,
 624 };
 625
 626 static struct attribute *rapl_events_gpu[] = {
 627         EVENT_PTR(rapl_gpu),
 628         EVENT_PTR(rapl_gpu_unit),
 629         EVENT_PTR(rapl_gpu_scale),
 630         NULL,
 631 };
 632
 633 static struct attribute_group rapl_events_gpu_group = {
 634         .name  = "events",
 635         .attrs = rapl_events_gpu,
 636 };
 637
 638 static struct attribute *rapl_events_psys[] = {
 639         EVENT_PTR(rapl_psys),
 640         EVENT_PTR(rapl_psys_unit),
 641         EVENT_PTR(rapl_psys_scale),
 642         NULL,
 643 };
 644
 645 static struct attribute_group rapl_events_psys_group = {
 646         .name  = "events",
 647         .attrs = rapl_events_psys,
 648 };
 649
 650 static bool test_msr(int idx, void *data)
 651 {
 652         return test_bit(idx, (unsigned long *) data);
 653 }
 654
 655 static struct perf_msr rapl_msrs[] = {
 656         [PERF_RAPL_PP0]  = { MSR_PP0_ENERGY_STATUS,      &rapl_events_cores_group, test_msr },
 657         [PERF_RAPL_PKG]  = { MSR_PKG_ENERGY_STATUS,      &rapl_events_pkg_group,   test_msr },
 658         [PERF_RAPL_RAM]  = { MSR_DRAM_ENERGY_STATUS,     &rapl_events_ram_group,   test_msr },
 659         [PERF_RAPL_PP1]  = { MSR_PP1_ENERGY_STATUS,      &rapl_events_gpu_group,   test_msr },
 660         [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group,  test_msr },
 661 };
 662
 663 static int rapl_cpu_offline(unsigned int cpu)
 664 {
 665         struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
 666         int target;
 667
 668         /* Check if exiting cpu is used for collecting rapl events */
 669         if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask))
 670                 return 0;
 671
 672         pmu->cpu = -1;
 673         /* Find a new cpu to collect rapl events */
 674         target = cpumask_any_but(topology_die_cpumask(cpu), cpu);
 675
 676         /* Migrate rapl events to the new target */
 677         if (target < nr_cpu_ids) {
 678                 cpumask_set_cpu(target, &rapl_cpu_mask);
 679                 pmu->cpu = target;
 680                 perf_pmu_migrate_context(pmu->pmu, cpu, target);
 681         }
 682         return 0;
 683 }
 684
 685 static int rapl_cpu_online(unsigned int cpu)
 686 {
 687         struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
 688         int target;
 689
 690         if (!pmu) {
 691                 pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
 692                 if (!pmu)
 693                         return -ENOMEM;
 694
 695                 raw_spin_lock_init(&pmu->lock);
 696                 INIT_LIST_HEAD(&pmu->active_list);
 697                 pmu->pmu = &rapl_pmus->pmu;
 698                 pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
 699                 rapl_hrtimer_init(pmu);
 700
 701                 rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu;
 702         }
 703
 704         /*
 705          * Check if there is an online cpu in the package which collects rapl
 706          * events already.
 707          */
 708         target = cpumask_any_and(&rapl_cpu_mask, topology_die_cpumask(cpu));
 709         if (target < nr_cpu_ids)
 710                 return 0;
 711
 712         cpumask_set_cpu(cpu, &rapl_cpu_mask);
 713         pmu->cpu = cpu;
 714         return 0;
 715 }
 716
 717 static int rapl_check_hw_unit(bool apply_quirk)
 718 {
 719         u64 msr_rapl_power_unit_bits;
 720         int i;
 721
 722         /* protect rdmsrl() to handle virtualization */
 723         if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
 724                 return -1;
 725         for (i = 0; i < NR_RAPL_DOMAINS; i++)
 726                 rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
 727
 728         /*
 729          * DRAM domain on HSW server and KNL has fixed energy unit which can be
 730          * different than the unit from power unit MSR. See
 731          * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2
 732          * of 2. Datasheet, September 2014, Reference Number: 330784-001 "
 733          */
 734         if (apply_quirk)
 735                 rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16;
 736
 737         /*
 738          * Calculate the timer rate:
 739          * Use reference of 200W for scaling the timeout to avoid counter
 740          * overflows. 200W = 200 Joules/sec
 741          * Divide interval by 2 to avoid lockstep (2 * 100)
 742          * if hw unit is 32, then we use 2 ms 1/200/2
 743          */
 744         rapl_timer_ms = 2;
 745         if (rapl_hw_unit[0] < 32) {
 746                 rapl_timer_ms = (1000 / (2 * 100));
 747                 rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1));
 748         }
 749         return 0;
 750 }
 751
 752 static void __init rapl_advertise(void)
 753 {
 754         int i;
 755
 756         pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
 757                 hweight32(rapl_cntr_mask), rapl_timer_ms);
 758
 759         for (i = 0; i < NR_RAPL_DOMAINS; i++) {
 760                 if (rapl_cntr_mask & (1 << i)) {
 761                         pr_info("hw unit of domain %s 2^-%d Joules\n",
 762                                 rapl_domain_names[i], rapl_hw_unit[i]);
 763                 }
 764         }
 765 }
 766
 767 static void cleanup_rapl_pmus(void)
 768 {
 769         int i;
 770
 771         for (i = 0; i < rapl_pmus->maxdie; i++)
 772                 kfree(rapl_pmus->pmus[i]);
 773         kfree(rapl_pmus);
 774 }
 775
 776 const struct attribute_group *rapl_attr_update[] = {
 777         &rapl_events_cores_group,
 778         &rapl_events_pkg_group,
 779         &rapl_events_ram_group,
 780         &rapl_events_gpu_group,
 781         &rapl_events_gpu_group,
 782         NULL,
 783 };
 784
 785 static int __init init_rapl_pmus(void)
 786 {
 787         int maxdie = topology_max_packages() * topology_max_die_per_package();
 788         size_t size;
 789
 790         size = sizeof(*rapl_pmus) + maxdie * sizeof(struct rapl_pmu *);
 791         rapl_pmus = kzalloc(size, GFP_KERNEL);
 792         if (!rapl_pmus)
 793                 return -ENOMEM;
 794
 795         rapl_pmus->maxdie               = maxdie;
 796         rapl_pmus->pmu.attr_groups      = rapl_attr_groups;
 797         rapl_pmus->pmu.attr_update      = rapl_attr_update;
 798         rapl_pmus->pmu.task_ctx_nr      = perf_invalid_context;
 799         rapl_pmus->pmu.event_init       = rapl_pmu_event_init;
 800         rapl_pmus->pmu.add              = rapl_pmu_event_add;
 801         rapl_pmus->pmu.del              = rapl_pmu_event_del;
 802         rapl_pmus->pmu.start            = rapl_pmu_event_start;
 803         rapl_pmus->pmu.stop             = rapl_pmu_event_stop;
 804         rapl_pmus->pmu.read             = rapl_pmu_event_read;
 805         rapl_pmus->pmu.module           = THIS_MODULE;
 806         rapl_pmus->pmu.capabilities     = PERF_PMU_CAP_NO_EXCLUDE;
 807         return 0;
 808 }
 809
 810 #define X86_RAPL_MODEL_MATCH(model, init)       \
 811         { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&init }
 812
 813 struct intel_rapl_init_fun {
 814         bool apply_quirk;
 815         int cntr_mask;
 816         struct attribute **attrs;
 817 };
 818
 819 static const struct intel_rapl_init_fun snb_rapl_init __initconst = {
 820         .apply_quirk = false,
 821         .cntr_mask = RAPL_IDX_CLN,
 822         .attrs = rapl_events_cln_attr,
 823 };
 824
 825 static const struct intel_rapl_init_fun hsx_rapl_init __initconst = {
 826         .apply_quirk = true,
 827         .cntr_mask = RAPL_IDX_SRV,
 828         .attrs = rapl_events_srv_attr,
 829 };
 830
 831 static const struct intel_rapl_init_fun hsw_rapl_init __initconst = {
 832         .apply_quirk = false,
 833         .cntr_mask = RAPL_IDX_HSW,
 834         .attrs = rapl_events_hsw_attr,
 835 };
 836
 837 static const struct intel_rapl_init_fun snbep_rapl_init __initconst = {
 838         .apply_quirk = false,
 839         .cntr_mask = RAPL_IDX_SRV,
 840         .attrs = rapl_events_srv_attr,
 841 };
 842
 843 static const struct intel_rapl_init_fun knl_rapl_init __initconst = {
 844         .apply_quirk = true,
 845         .cntr_mask = RAPL_IDX_KNL,
 846         .attrs = rapl_events_knl_attr,
 847 };
 848
 849 static const struct intel_rapl_init_fun skl_rapl_init __initconst = {
 850         .apply_quirk = false,
 851         .cntr_mask = RAPL_IDX_SKL_CLN,
 852         .attrs = rapl_events_skl_attr,
 853 };
 854
 855 static const struct x86_cpu_id rapl_cpu_match[] __initconst = {
 856         X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE,   snb_rapl_init),
 857         X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE_X, snbep_rapl_init),
 858
 859         X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE,   snb_rapl_init),
 860         X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE_X, snbep_rapl_init),
 861
 862         X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_CORE, hsw_rapl_init),
 863         X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_X,    hsx_rapl_init),
 864         X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_ULT,  hsw_rapl_init),
 865         X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_GT3E, hsw_rapl_init),
 866
 867         X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_CORE,   hsw_rapl_init),
 868         X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_GT3E,   hsw_rapl_init),
 869         X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X,      hsx_rapl_init),
 870         X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D, hsx_rapl_init),
 871
 872         X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, knl_rapl_init),
 873         X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNM, knl_rapl_init),
 874
 875         X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_MOBILE,  skl_rapl_init),
 876         X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP, skl_rapl_init),
 877         X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X,       hsx_rapl_init),
 878
 879         X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_MOBILE,  skl_rapl_init),
 880         X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, skl_rapl_init),
 881
 882         X86_RAPL_MODEL_MATCH(INTEL_FAM6_CANNONLAKE_MOBILE,  skl_rapl_init),
 883
 884         X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, hsw_rapl_init),
 885         X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_X, hsw_rapl_init),
 886
 887         X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_PLUS, hsw_rapl_init),
 888
 889         X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE_MOBILE,  skl_rapl_init),
 890         X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE_DESKTOP, skl_rapl_init),
 891         {},
 892 };
 893
 894 MODULE_DEVICE_TABLE(x86cpu, rapl_cpu_match);
 895
 896 static struct rapl_model model_snb = {
 897         .events         = BIT(PERF_RAPL_PP0) |
 898                           BIT(PERF_RAPL_PKG) |
 899                           BIT(PERF_RAPL_PP1),
 900         .apply_quirk    = false,
 901 };
 902
 903 static struct rapl_model model_snbep = {
 904         .events         = BIT(PERF_RAPL_PP0) |
 905                           BIT(PERF_RAPL_PKG) |
 906                           BIT(PERF_RAPL_RAM),
 907         .apply_quirk    = false,
 908 };
 909
 910 static struct rapl_model model_hsw = {
 911         .events         = BIT(PERF_RAPL_PP0) |
 912                           BIT(PERF_RAPL_PKG) |
 913                           BIT(PERF_RAPL_RAM) |
 914                           BIT(PERF_RAPL_PP1),
 915         .apply_quirk    = false,
 916 };
 917
 918 static struct rapl_model model_hsx = {
 919         .events         = BIT(PERF_RAPL_PP0) |
 920                           BIT(PERF_RAPL_PKG) |
 921                           BIT(PERF_RAPL_RAM),
 922         .apply_quirk    = true,
 923 };
 924
 925 static struct rapl_model model_knl = {
 926         .events         = BIT(PERF_RAPL_PKG) |
 927                           BIT(PERF_RAPL_RAM),
 928         .apply_quirk    = true,
 929 };
 930
 931 static struct rapl_model model_skl = {
 932         .events         = BIT(PERF_RAPL_PP0) |
 933                           BIT(PERF_RAPL_PKG) |
 934                           BIT(PERF_RAPL_RAM) |
 935                           BIT(PERF_RAPL_PP1) |
 936                           BIT(PERF_RAPL_PSYS),
 937         .apply_quirk    = false,
 938 };
 939
 940 static const struct x86_cpu_id rapl_model_match[] __initconst = {
 941         X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE,            model_snb),
 942         X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE_X,          model_snbep),
 943         X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE,              model_snb),
 944         X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE_X,            model_snbep),
 945         X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_CORE,           model_hsw),
 946         X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_X,              model_hsx),
 947         X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_ULT,            model_hsw),
 948         X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_GT3E,           model_hsw),
 949         X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_CORE,         model_hsw),
 950         X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_GT3E,         model_hsw),
 951         X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X,            model_hsx),
 952         X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D,       model_hsx),
 953         X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL,           model_knl),
 954         X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNM,           model_knl),
 955         X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_MOBILE,         model_skl),
 956         X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP,        model_skl),
 957         X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X,              model_hsx),
 958         X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_MOBILE,        model_skl),
 959         X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP,       model_skl),
 960         X86_RAPL_MODEL_MATCH(INTEL_FAM6_CANNONLAKE_MOBILE,      model_skl),
 961         X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT,          model_hsw),
 962         X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_X,        model_hsw),
 963         X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_PLUS,     model_hsw),
 964         X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE_MOBILE,         model_skl),
 965         X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE_DESKTOP,        model_skl),
 966         {},
 967 };
 968
 969 static int __init rapl_pmu_init(void)
 970 {
 971         const struct x86_cpu_id *id;
 972         struct intel_rapl_init_fun *rapl_init;
 973         struct rapl_model *rm;
 974         bool apply_quirk;
 975         int ret;
 976
 977         id = x86_match_cpu(rapl_model_match);
 978         if (!id)
 979                 return -ENODEV;
 980
 981         rm = (struct rapl_model *) id->driver_data;
 982         perf_msr_probe(rapl_msrs, PERF_RAPL_MAX, false, (void *) &rm->events);
 983
 984         id = x86_match_cpu(rapl_cpu_match);
 985         if (!id)
 986                 return -ENODEV;
 987
 988         rapl_init = (struct intel_rapl_init_fun *)id->driver_data;
 989         apply_quirk = rapl_init->apply_quirk;
 990         rapl_cntr_mask = rapl_init->cntr_mask;
 991         rapl_pmu_events_group.attrs = rapl_init->attrs;
 992
 993         ret = rapl_check_hw_unit(apply_quirk);
 994         if (ret)
 995                 return ret;
 996
 997         ret = init_rapl_pmus();
 998         if (ret)
 999                 return ret;
1000
1001         /*
1002          * Install callbacks. Core will call them for each online cpu.
1003          */
1004         ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE,
1005                                 "perf/x86/rapl:online",
1006                                 rapl_cpu_online, rapl_cpu_offline);
1007         if (ret)
1008                 goto out;
1009
1010         ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
1011         if (ret)
1012                 goto out1;
1013
1014         rapl_advertise();
1015         return 0;
1016
1017 out1:
1018         cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE);
1019 out:
1020         pr_warn("Initialization failed (%d), disabled\n", ret);
1021         cleanup_rapl_pmus();
1022         return ret;
1023 }
1024 module_init(rapl_pmu_init);
1025
1026 static void __exit intel_rapl_exit(void)
1027 {
1028         cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE);
1029         perf_pmu_unregister(&rapl_pmus->pmu);
1030         cleanup_rapl_pmus();
1031 }
1032 module_exit(intel_rapl_exit);