/*
 * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters
 * Copyright (C) 2013 Google, Inc., Stephane Eranian
 *
 * Intel RAPL interface is specified in the IA-32 Manual Vol3b
 * section 14.7.1 (September 2013)
 *
 * RAPL provides more controls than just reporting energy consumption;
 * however, here we only expose the energy consumption free running
 * counters (pp0, pkg, dram, pp1).
 *
 * Each of those counters increments in a power unit defined by the
 * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules.
 *
 * Counter to rapl events mappings:
 *
 *  pp0 counter: consumption of all physical cores (power plane 0)
 *	  event: rapl_energy_cores
 *    perf code: 0x1
 *
 *  pkg counter: consumption of the whole processor package
 *	  event: rapl_energy_pkg
 *    perf code: 0x2
 *
 * dram counter: consumption of the dram domain (servers only)
 *	  event: rapl_energy_dram
 *    perf code: 0x3
 *
 *  gpu counter: consumption of the builtin-gpu domain (client only)
 *	  event: rapl_energy_gpu
 *    perf code: 0x4
 *
 * We manage those counters as free running (read-only). They may be
 * used simultaneously by other tools, such as turbostat.
 *
 * The events only support system-wide mode counting. There is no
 * sampling support because it does not make sense and is not
 * supported by the RAPL hardware.
 *
 * Because we want to avoid floating-point operations in the kernel,
 * the events are all reported in fixed point arithmetic (32.32).
 * Tools must adjust the counts to convert them to Watts using
 * the duration of the measurement. Tools may use a function such as
 * ldexp(raw_count, -32);
 */
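/*
 * For instance (illustrative userspace sketch, not kernel code): a raw
 * count of 0x80000000 read over a 2 second interval converts as:
 *
 *	double joules = ldexp((double)raw_count, -32);	// 0.5 Joules
 *	double watts  = joules / interval_sec;		// 0.25 Watts
 */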
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <asm/cpu_device_id.h>
#include "perf_event.h"
/*
 * RAPL energy status counters
 */
#define RAPL_IDX_PP0_NRG_STAT	0	/* all cores */
#define INTEL_RAPL_PP0		0x1	/* pseudo-encoding */
#define RAPL_IDX_PKG_NRG_STAT	1	/* entire package */
#define INTEL_RAPL_PKG		0x2	/* pseudo-encoding */
#define RAPL_IDX_RAM_NRG_STAT	2	/* DRAM */
#define INTEL_RAPL_RAM		0x3	/* pseudo-encoding */
#define RAPL_IDX_PP1_NRG_STAT	3	/* gpu */
#define INTEL_RAPL_PP1		0x4	/* pseudo-encoding */
/* Clients have PP0, PKG, PP1 */
#define RAPL_IDX_CLN	(1<<RAPL_IDX_PP0_NRG_STAT|\
			 1<<RAPL_IDX_PKG_NRG_STAT|\
			 1<<RAPL_IDX_PP1_NRG_STAT)

/* Servers have PP0, PKG, RAM */
#define RAPL_IDX_SRV	(1<<RAPL_IDX_PP0_NRG_STAT|\
			 1<<RAPL_IDX_PKG_NRG_STAT|\
			 1<<RAPL_IDX_RAM_NRG_STAT)

/* Haswell clients have PP0, PKG, RAM, PP1 */
#define RAPL_IDX_HSW	(1<<RAPL_IDX_PP0_NRG_STAT|\
			 1<<RAPL_IDX_PKG_NRG_STAT|\
			 1<<RAPL_IDX_RAM_NRG_STAT|\
			 1<<RAPL_IDX_PP1_NRG_STAT)
/*
 * event code: LSB 8 bits, passed in attr->config
 * any other bit is reserved
 */
#define RAPL_EVENT_MASK	0xFFULL
#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format)		\
static ssize_t __rapl_##_var##_show(struct kobject *kobj,	\
				struct kobj_attribute *attr,	\
				char *page)			\
{								\
	BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);		\
	return sprintf(page, _format "\n");			\
}								\
static struct kobj_attribute format_attr_##_var =		\
	__ATTR(_name, 0444, __rapl_##_var##_show, NULL)
#define RAPL_EVENT_DESC(_name, _config)				\
{								\
	.attr	= __ATTR(_name, 0444, rapl_event_show, NULL),	\
	.config	= _config,					\
}

#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */
struct rapl_pmu {
	spinlock_t	 lock;
	int		 hw_unit;		/* 1/2^hw_unit Joule */
	int		 n_active;		/* number of active events */
	struct list_head active_list;
	struct pmu	 *pmu;			/* pointer to rapl_pmu_class */
	ktime_t		 timer_interval;	/* in ktime_t unit */
	struct hrtimer	 hrtimer;
};
static struct pmu rapl_pmu_class;
static cpumask_t rapl_cpu_mask;
static int rapl_cntr_mask;

static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu);
static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free);
static inline u64 rapl_read_counter(struct perf_event *event)
{
	u64 raw;

	rdmsrl(event->hw.event_base, raw);
	return raw;
}
static inline u64 rapl_scale(u64 v)
{
	/*
	 * scale delta to smallest unit (1/2^32)
	 * users must then scale back: count * 1/2^32 to get Joules
	 * or use ldexp(count, -32).
	 * Watts = Joules/Time delta
	 */
	return v << (32 - __get_cpu_var(rapl_pmu)->hw_unit);
}
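/*
 * Worked example (values for illustration only): with the SandyBridge
 * hw_unit of 16, rapl_scale() shifts a raw delta left by 32 - 16 = 16
 * bits, so a delta of 1 (2^-16 J) becomes 2^16 units of 2^-32 J, i.e.
 * the same energy expressed in the fixed 32.32 API unit.
 */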
static u64 rapl_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	u64 prev_raw_count, new_raw_count;
	s64 delta, sdelta;
	int shift = RAPL_CNTR_WIDTH;

again:
	prev_raw_count = local64_read(&hwc->prev_count);
	rdmsrl(event->hw.event_base, new_raw_count);

	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
			    new_raw_count) != prev_raw_count) {
		cpu_relax();
		goto again;
	}

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the register.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	sdelta = rapl_scale(delta);

	local64_add(sdelta, &event->count);

	return new_raw_count;
}
static void rapl_start_hrtimer(struct rapl_pmu *pmu)
{
	__hrtimer_start_range_ns(&pmu->hrtimer,
			pmu->timer_interval, 0,
			HRTIMER_MODE_REL_PINNED, 0);
}

static void rapl_stop_hrtimer(struct rapl_pmu *pmu)
{
	hrtimer_cancel(&pmu->hrtimer);
}
static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
{
	struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
	struct perf_event *event;
	unsigned long flags;

	if (!pmu->n_active)
		return HRTIMER_NORESTART;

	spin_lock_irqsave(&pmu->lock, flags);

	list_for_each_entry(event, &pmu->active_list, active_entry) {
		rapl_event_update(event);
	}

	spin_unlock_irqrestore(&pmu->lock, flags);

	hrtimer_forward_now(hrtimer, pmu->timer_interval);

	return HRTIMER_RESTART;
}
static void rapl_hrtimer_init(struct rapl_pmu *pmu)
{
	struct hrtimer *hr = &pmu->hrtimer;

	hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hr->function = rapl_hrtimer_handle;
}
static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
				   struct perf_event *event)
{
	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
		return;

	event->hw.state = 0;

	list_add_tail(&event->active_entry, &pmu->active_list);

	local64_set(&event->hw.prev_count, rapl_read_counter(event));

	pmu->n_active++;
	if (pmu->n_active == 1)
		rapl_start_hrtimer(pmu);
}
static void rapl_pmu_event_start(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
	unsigned long flags;

	spin_lock_irqsave(&pmu->lock, flags);
	__rapl_pmu_event_start(pmu, event);
	spin_unlock_irqrestore(&pmu->lock, flags);
}
static void rapl_pmu_event_stop(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
	struct hw_perf_event *hwc = &event->hw;
	unsigned long flags;

	spin_lock_irqsave(&pmu->lock, flags);

	/* mark event as deactivated and stopped */
	if (!(hwc->state & PERF_HES_STOPPED)) {
		WARN_ON_ONCE(pmu->n_active <= 0);
		pmu->n_active--;
		if (pmu->n_active == 0)
			rapl_stop_hrtimer(pmu);

		list_del(&event->active_entry);

		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
		hwc->state |= PERF_HES_STOPPED;
	}

	/* check if update of sw counter is necessary */
	if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/*
		 * Drain the remaining delta count out of an event
		 * that we are disabling:
		 */
		rapl_event_update(event);
		hwc->state |= PERF_HES_UPTODATE;
	}

	spin_unlock_irqrestore(&pmu->lock, flags);
}
static int rapl_pmu_event_add(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
	struct hw_perf_event *hwc = &event->hw;
	unsigned long flags;

	spin_lock_irqsave(&pmu->lock, flags);

	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;

	if (mode & PERF_EF_START)
		__rapl_pmu_event_start(pmu, event);

	spin_unlock_irqrestore(&pmu->lock, flags);

	return 0;
}
static void rapl_pmu_event_del(struct perf_event *event, int flags)
{
	rapl_pmu_event_stop(event, PERF_EF_UPDATE);
}
static int rapl_pmu_event_init(struct perf_event *event)
{
	u64 cfg = event->attr.config & RAPL_EVENT_MASK;
	int bit, msr, ret = 0;

	/* only look at RAPL events */
	if (event->attr.type != rapl_pmu_class.type)
		return -ENOENT;

	/* check only supported bits are set */
	if (event->attr.config & ~RAPL_EVENT_MASK)
		return -EINVAL;

	/*
	 * check event is known (determines counter)
	 */
	switch (cfg) {
	case INTEL_RAPL_PP0:
		bit = RAPL_IDX_PP0_NRG_STAT;
		msr = MSR_PP0_ENERGY_STATUS;
		break;
	case INTEL_RAPL_PKG:
		bit = RAPL_IDX_PKG_NRG_STAT;
		msr = MSR_PKG_ENERGY_STATUS;
		break;
	case INTEL_RAPL_RAM:
		bit = RAPL_IDX_RAM_NRG_STAT;
		msr = MSR_DRAM_ENERGY_STATUS;
		break;
	case INTEL_RAPL_PP1:
		bit = RAPL_IDX_PP1_NRG_STAT;
		msr = MSR_PP1_ENERGY_STATUS;
		break;
	default:
		return -EINVAL;
	}

	/* check event supported */
	if (!(rapl_cntr_mask & (1 << bit)))
		return -EINVAL;

	/* unsupported modes and filters */
	if (event->attr.exclude_user   ||
	    event->attr.exclude_kernel ||
	    event->attr.exclude_hv     ||
	    event->attr.exclude_idle   ||
	    event->attr.exclude_host   ||
	    event->attr.exclude_guest  ||
	    event->attr.sample_period) /* no sampling */
		return -EINVAL;

	/* must be done before validate_group */
	event->hw.event_base = msr;
	event->hw.config = cfg;
	event->hw.idx = bit;

	return ret;
}
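/*
 * A userspace consumer would open such an event in system-wide mode,
 * e.g. (hypothetical sketch; the dynamic PMU type must be read from
 * /sys/bus/event_source/devices/power/type at runtime):
 *
 *	struct perf_event_attr attr = {
 *		.type   = power_pmu_type,	// from sysfs, assumed
 *		.config = 0x2,			// INTEL_RAPL_PKG
 *	};
 *	// pid == -1, cpu == 0: count on cpu0, all processes
 *	int fd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
 */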
static void rapl_pmu_event_read(struct perf_event *event)
{
	rapl_event_update(event);
}
static ssize_t rapl_get_attr_cpumask(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &rapl_cpu_mask);

	buf[n++] = '\n';
	buf[n] = '\0';
	return n;
}

static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
static struct attribute *rapl_pmu_attrs[] = {
	&dev_attr_cpumask.attr,
	NULL,
};

static struct attribute_group rapl_pmu_attr_group = {
	.attrs = rapl_pmu_attrs,
};
EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
EVENT_ATTR_STR(energy-pkg  , rapl_pkg, "event=0x02");
EVENT_ATTR_STR(energy-ram  , rapl_ram, "event=0x03");
EVENT_ATTR_STR(energy-gpu  , rapl_gpu, "event=0x04");

EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
EVENT_ATTR_STR(energy-pkg.unit  , rapl_pkg_unit, "Joules");
EVENT_ATTR_STR(energy-ram.unit  , rapl_ram_unit, "Joules");
EVENT_ATTR_STR(energy-gpu.unit  , rapl_gpu_unit, "Joules");
/*
 * All events are reported in units of 2^-32 Joules (~0.23 nJ),
 * regardless of the unit advertised by the RAPL_POWER_UNIT MSR.
 */
EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
EVENT_ATTR_STR(energy-pkg.scale,   rapl_pkg_scale,   "2.3283064365386962890625e-10");
EVENT_ATTR_STR(energy-ram.scale,   rapl_ram_scale,   "2.3283064365386962890625e-10");
EVENT_ATTR_STR(energy-gpu.scale,   rapl_gpu_scale,   "2.3283064365386962890625e-10");
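/*
 * With the unit and scale attributes above, perf multiplies the raw
 * 32.32 count by the scale string and prints Joules directly, e.g.
 * (illustrative invocation):
 *
 *	perf stat -a -e power/energy-pkg/ -- sleep 1
 */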
static struct attribute *rapl_events_srv_attr[] = {
	EVENT_PTR(rapl_cores),
	EVENT_PTR(rapl_pkg),
	EVENT_PTR(rapl_ram),

	EVENT_PTR(rapl_cores_unit),
	EVENT_PTR(rapl_pkg_unit),
	EVENT_PTR(rapl_ram_unit),

	EVENT_PTR(rapl_cores_scale),
	EVENT_PTR(rapl_pkg_scale),
	EVENT_PTR(rapl_ram_scale),
	NULL,
};
static struct attribute *rapl_events_cln_attr[] = {
	EVENT_PTR(rapl_cores),
	EVENT_PTR(rapl_pkg),
	EVENT_PTR(rapl_gpu),

	EVENT_PTR(rapl_cores_unit),
	EVENT_PTR(rapl_pkg_unit),
	EVENT_PTR(rapl_gpu_unit),

	EVENT_PTR(rapl_cores_scale),
	EVENT_PTR(rapl_pkg_scale),
	EVENT_PTR(rapl_gpu_scale),
	NULL,
};
static struct attribute *rapl_events_hsw_attr[] = {
	EVENT_PTR(rapl_cores),
	EVENT_PTR(rapl_pkg),
	EVENT_PTR(rapl_gpu),
	EVENT_PTR(rapl_ram),

	EVENT_PTR(rapl_cores_unit),
	EVENT_PTR(rapl_pkg_unit),
	EVENT_PTR(rapl_gpu_unit),
	EVENT_PTR(rapl_ram_unit),

	EVENT_PTR(rapl_cores_scale),
	EVENT_PTR(rapl_pkg_scale),
	EVENT_PTR(rapl_gpu_scale),
	EVENT_PTR(rapl_ram_scale),
	NULL,
};
static struct attribute_group rapl_pmu_events_group = {
	.name  = "events",
	.attrs = NULL, /* patched at runtime */
};
DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");
static struct attribute *rapl_formats_attr[] = {
	&format_attr_event.attr,
	NULL,
};

static struct attribute_group rapl_pmu_format_group = {
	.name  = "format",
	.attrs = rapl_formats_attr,
};
const struct attribute_group *rapl_attr_groups[] = {
	&rapl_pmu_attr_group,
	&rapl_pmu_format_group,
	&rapl_pmu_events_group,
	NULL,
};
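/*
 * Once registered as "power" (see rapl_pmu_init() below), these groups
 * are expected to appear in sysfs roughly as:
 *
 *	/sys/bus/event_source/devices/power/cpumask
 *	/sys/bus/event_source/devices/power/format/event
 *	/sys/bus/event_source/devices/power/events/energy-pkg
 */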
static struct pmu rapl_pmu_class = {
	.attr_groups	= rapl_attr_groups,
	.task_ctx_nr	= perf_invalid_context, /* system-wide only */
	.event_init	= rapl_pmu_event_init,
	.add		= rapl_pmu_event_add, /* must have */
	.del		= rapl_pmu_event_del, /* must have */
	.start		= rapl_pmu_event_start,
	.stop		= rapl_pmu_event_stop,
	.read		= rapl_pmu_event_read,
};
static void rapl_cpu_exit(int cpu)
{
	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
	int i, phys_id = topology_physical_package_id(cpu);
	int target = -1;

	/* find a new cpu on same package */
	for_each_online_cpu(i) {
		if (i == cpu)
			continue;
		if (phys_id == topology_physical_package_id(i)) {
			target = i;
			break;
		}
	}
	/*
	 * clear cpu from cpumask
	 * if was set in cpumask and still some cpu on package,
	 * then move to new cpu
	 */
	if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0)
		cpumask_set_cpu(target, &rapl_cpu_mask);

	WARN_ON(cpumask_empty(&rapl_cpu_mask));
	/*
	 * migrate events and context to new cpu
	 */
	if (target >= 0)
		perf_pmu_migrate_context(pmu->pmu, cpu, target);

	/* cancel overflow polling timer for CPU */
	rapl_stop_hrtimer(pmu);
}
static void rapl_cpu_init(int cpu)
{
	int i, phys_id = topology_physical_package_id(cpu);

	/* check if phys_id is already covered */
	for_each_cpu(i, &rapl_cpu_mask) {
		if (phys_id == topology_physical_package_id(i))
			return;
	}
	/* was not found, so add it */
	cpumask_set_cpu(cpu, &rapl_cpu_mask);
}
static int rapl_cpu_prepare(int cpu)
{
	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
	int phys_id = topology_physical_package_id(cpu);
	u64 ms;
	u64 msr_rapl_power_unit_bits;

	if (pmu)
		return 0;

	if (phys_id < 0)
		return -1;

	/* protect rdmsrl() to handle virtualization */
	if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
		return -1;

	pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
	if (!pmu)
		return -1;

	spin_lock_init(&pmu->lock);

	INIT_LIST_HEAD(&pmu->active_list);

	/*
	 * grab power unit as: 1/2^unit Joules
	 *
	 * we cache in local PMU instance
	 */
	pmu->hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
	pmu->pmu = &rapl_pmu_class;

	/*
	 * use reference of 200W for scaling the timeout
	 * to avoid missing counter overflows.
	 * 200W = 200 Joules/sec
	 * divide interval by 2 to avoid lockstep (2 * 100)
	 * if hw unit is 32, then we use 2 ms 1/200/2
	 */
	if (pmu->hw_unit < 32)
		ms = (1000 / (2 * 100)) * (1ULL << (32 - pmu->hw_unit - 1));
	else
		ms = 2;

	pmu->timer_interval = ms_to_ktime(ms);
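	/*
	 * Worked example (illustrative): with the SandyBridge hw_unit
	 * of 16, the 32-bit counter wraps after 2^32 * 2^-16 J =
	 * 65536 J, i.e. ~327s at the 200W reference; the formula above
	 * gives ms = 5 * 2^15 = 163840 ms, exactly half of that.
	 */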
	rapl_hrtimer_init(pmu);

	/* set RAPL pmu for this cpu for now */
	per_cpu(rapl_pmu, cpu) = pmu;
	per_cpu(rapl_pmu_to_free, cpu) = NULL;

	return 0;
}
static void rapl_cpu_kfree(int cpu)
{
	struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu);

	kfree(pmu);

	per_cpu(rapl_pmu_to_free, cpu) = NULL;
}
static int rapl_cpu_dying(int cpu)
{
	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);

	if (!pmu)
		return 0;

	per_cpu(rapl_pmu, cpu) = NULL;
	per_cpu(rapl_pmu_to_free, cpu) = pmu;

	return 0;
}
static int rapl_cpu_notifier(struct notifier_block *self,
			     unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		rapl_cpu_prepare(cpu);
		break;
	case CPU_STARTING:
		rapl_cpu_init(cpu);
		break;
	case CPU_UP_CANCELED:
	case CPU_DYING:
		rapl_cpu_dying(cpu);
		break;
	case CPU_ONLINE:
	case CPU_DEAD:
		rapl_cpu_kfree(cpu);
		break;
	case CPU_DOWN_PREPARE:
		rapl_cpu_exit(cpu);
		break;
	default:
		break;
	}

	return NOTIFY_OK;
}
static const struct x86_cpu_id rapl_cpu_match[] = {
	[0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
	[1] = {},
};
static int __init rapl_pmu_init(void)
{
	struct rapl_pmu *pmu;
	int cpu, ret;

	/*
	 * check for Intel processor family 6
	 */
	if (!x86_match_cpu(rapl_cpu_match))
		return 0;

	/* check supported CPU */
	switch (boot_cpu_data.x86_model) {
	case 42: /* Sandy Bridge */
	case 58: /* Ivy Bridge */
		rapl_cntr_mask = RAPL_IDX_CLN;
		rapl_pmu_events_group.attrs = rapl_events_cln_attr;
		break;
	case 60: /* Haswell */
	case 69: /* Haswell-Celeron */
		rapl_cntr_mask = RAPL_IDX_HSW;
		rapl_pmu_events_group.attrs = rapl_events_hsw_attr;
		break;
	case 45: /* Sandy Bridge-EP */
	case 62: /* IvyTown */
		rapl_cntr_mask = RAPL_IDX_SRV;
		rapl_pmu_events_group.attrs = rapl_events_srv_attr;
		break;
	default:
		/* unsupported */
		return 0;
	}
	cpu_notifier_register_begin();

	for_each_online_cpu(cpu) {
		ret = rapl_cpu_prepare(cpu);
		if (ret)
			goto out;
		rapl_cpu_init(cpu);
	}

	__perf_cpu_notifier(rapl_cpu_notifier);

	ret = perf_pmu_register(&rapl_pmu_class, "power", -1);
	if (WARN_ON(ret)) {
		pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n",
			ret);
		cpu_notifier_register_done();
		return -1;
	}

	pmu = __get_cpu_var(rapl_pmu);

	pr_info("RAPL PMU detected, hw unit 2^-%d Joules,"
		" API unit is 2^-32 Joules,"
		" %d fixed counters,"
		" %llu ms ovfl timer\n",
		pmu->hw_unit,
		hweight32(rapl_cntr_mask),
		ktime_to_ms(pmu->timer_interval));

out:
	cpu_notifier_register_done();

	return 0;
}
device_initcall(rapl_pmu_init);