// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86 APERF/MPERF KHz calculation for
 * /sys/.../cpufreq/scaling_cur_freq
 *
 * Copyright (C) 2017 Intel Corp.
 * Author: Len Brown <len.brown@intel.com>
 */
#include <linux/cpufreq.h>
#include <linux/delay.h>
#include <linux/ktime.h>
#include <linux/math64.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/sched/isolation.h>
#include <linux/sched/topology.h>
#include <linux/smp.h>
#include <linux/syscore_ops.h>

#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
struct aperfmperf {
	seqcount_t	seq;
	unsigned long	last_update;
	u64		acnt;
	u64		mcnt;
	u64		aperf;
	u64		mperf;
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = {
	.seq = SEQCNT_ZERO(cpu_samples.seq)
};
static void init_counter_refs(void)
{
	u64 aperf, mperf;

	rdmsrl(MSR_IA32_APERF, aperf);
	rdmsrl(MSR_IA32_MPERF, mperf);

	this_cpu_write(cpu_samples.aperf, aperf);
	this_cpu_write(cpu_samples.mperf, mperf);
}
#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
/*
 * APERF/MPERF frequency ratio computation.
 *
 * The scheduler wants to do frequency invariant accounting and needs a <1
 * ratio to account for the 'current' frequency, corresponding to
 * freq_curr / freq_max.
 *
 * Since the frequency freq_curr on x86 is controlled by a micro-controller
 * and our P-state setting is little more than a request/hint, we need to
 * observe the effective frequency 'BusyMHz', i.e. the average frequency over
 * a time interval after discarding idle time. This is given by:
 *
 *            BusyMHz = delta_APERF / delta_MPERF * freq_base
 *
 * where freq_base is the max non-turbo P-state.
 *
 * The freq_max term has to be set to a somewhat arbitrary value, because we
 * can't know which turbo states will be available at a given point in time:
 * it all depends on the thermal headroom of the entire package. We set it to
 * the turbo level with 4 cores active.
 *
 * Benchmarks show that's a good compromise between the 1C turbo ratio
 * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
 * which would ignore the entire turbo range (a conspicuous part, making
 * freq_curr/freq_max always maxed out).
 *
 * An exception to the heuristic above is the Atom uarch, where we choose the
 * highest turbo level for freq_max since Atoms are generally oriented towards
 * power efficiency.
 *
 * Setting freq_max to anything less than the 1C turbo ratio makes the ratio
 * freq_curr / freq_max eventually grow >1, in which case we clip it to 1.
 */
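/*
 * Illustrative sketch, not part of the original file: the BusyMHz formula
 * above expressed in integer math. The helper name and its khz_base
 * parameter are made up for illustration only; the real computations live
 * in scale_freq_tick() and arch_freq_get_on_cpu() below.
 */
static inline u64 busy_khz_sketch(u64 delta_aperf, u64 delta_mperf, u64 khz_base)
{
	/* MPERF did not advance (e.g. the CPU was idle): no usable sample. */
	if (!delta_mperf)
		return 0;

	return div64_u64(delta_aperf * khz_base, delta_mperf);
}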
DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);

static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;
void arch_set_max_freq_ratio(bool turbo_disabled)
{
	arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
					       arch_turbo_freq_ratio;
}
EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);
static bool __init turbo_disabled(void)
{
	u64 misc_en;
	int err;

	err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en);
	if (err)
		return false;

	return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
}
static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
	int err;

	err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq);
	if (err)
		return false;

	err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 16) & 0x3F;	/* max P state */
	*turbo_freq = *turbo_freq & 0x3F;	/* 1C turbo    */

	return true;
}
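/*
 * Decode example with a made-up register value: MSR_ATOM_CORE_RATIOS =
 * 0x00180000 gives (0x00180000 >> 16) & 0x3F = 0x18 = 24, i.e. a max
 * non-turbo ratio of 24.
 */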
#define X86_MATCH(vfm)					\
	X86_MATCH_VFM_FEATURE(vfm, X86_FEATURE_APERFMPERF, NULL)

static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = {
	X86_MATCH(INTEL_XEON_PHI_KNL),
	X86_MATCH(INTEL_XEON_PHI_KNM),
	{}
};

static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = {
	X86_MATCH(INTEL_SKYLAKE_X),
	{}
};

static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = {
	X86_MATCH(INTEL_ATOM_GOLDMONT),
	X86_MATCH(INTEL_ATOM_GOLDMONT_D),
	X86_MATCH(INTEL_ATOM_GOLDMONT_PLUS),
	{}
};
static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
					  int num_delta_fratio)
{
	int fratio, delta_fratio, found;
	int err, i;
	u64 msr;

	err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */

	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
	if (err)
		return false;

	fratio = (msr >> 8) & 0xFF;
	i = 16;
	found = 0;
	do {
		if (found >= num_delta_fratio) {
			*turbo_freq = fratio;
			return true;
		}

		delta_fratio = (msr >> (i + 5)) & 0x7;
		if (delta_fratio) {
			found += 1;
			fratio -= delta_fratio;
		}

		i += 8;
	} while (i < 64);

	return true;
}
static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
{
	u64 ratios, counts;
	u32 group_size;
	int err, i;

	err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */

	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
	if (err)
		return false;

	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
	if (err)
		return false;

	for (i = 0; i < 64; i += 8) {
		group_size = (counts >> i) & 0xFF;
		if (group_size >= size) {
			*turbo_freq = (ratios >> i) & 0xFF;
			return true;
		}
	}

	return false;
}
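/*
 * Example of the pairing above, with made-up register contents: if byte 0 of
 * MSR_TURBO_RATIO_LIMIT1 holds a group size of 2 and byte 1 holds 4, a call
 * with size = 4 skips the first group and returns the turbo ratio stored in
 * byte 1 of MSR_TURBO_RATIO_LIMIT.
 */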
static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
	u64 msr;
	int err;

	err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
	if (err)
		return false;

	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
	if (err)
		return false;

	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */
	*turbo_freq = (msr >> 24) & 0xFF;	/* 4C turbo    */

	/* The CPU may have less than 4 cores */
	if (!*turbo_freq)
		*turbo_freq = msr & 0xFF;	/* 1C turbo    */

	return true;
}
static bool __init intel_set_max_freq_ratio(void)
{
	u64 base_freq, turbo_freq;
	u64 turbo_ratio;

	if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
		goto out;

	if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
	    skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
		goto out;

	if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
	    knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
		goto out;

	if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
	    skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
		goto out;

	if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
		goto out;

	return false;

out:
	/*
	 * Some hypervisors advertise X86_FEATURE_APERFMPERF
	 * but then fill all MSRs with zeroes.
	 * Some CPUs have turbo boost but don't declare any turbo ratio
	 * in MSR_TURBO_RATIO_LIMIT.
	 */
	if (!base_freq || !turbo_freq) {
		pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
		return false;
	}

	turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
	if (!turbo_ratio) {
		pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
		return false;
	}

	arch_turbo_freq_ratio = turbo_ratio;
	arch_set_max_freq_ratio(turbo_disabled());

	return true;
}
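/*
 * Worked example with made-up ratios: base_freq = 20 and turbo_freq = 28
 * (roughly 2.0 GHz and 2.8 GHz on parts with a 100 MHz bus clock) give
 *
 *	turbo_ratio = 28 * 1024 / 20 = 1433
 *
 * so with turbo enabled arch_max_freq_ratio ends up at 1433 rather than
 * SCHED_CAPACITY_SCALE (1024).
 */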
#ifdef CONFIG_PM_SLEEP
static struct syscore_ops freq_invariance_syscore_ops = {
	.resume = init_counter_refs,
};

static void register_freq_invariance_syscore_ops(void)
{
	register_syscore_ops(&freq_invariance_syscore_ops);
}
#else
static inline void register_freq_invariance_syscore_ops(void) {}
#endif

static void freq_invariance_enable(void)
{
	if (static_branch_unlikely(&arch_scale_freq_key)) {
		WARN_ON_ONCE(1);
		return;
	}
	static_branch_enable(&arch_scale_freq_key);
	register_freq_invariance_syscore_ops();
	pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
}

void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled)
{
	arch_turbo_freq_ratio = ratio;
	arch_set_max_freq_ratio(turbo_disabled);
	freq_invariance_enable();
}
static void __init bp_init_freq_invariance(void)
{
	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
		return;

	if (intel_set_max_freq_ratio())
		freq_invariance_enable();
}
static void disable_freq_invariance_workfn(struct work_struct *work)
{
	int cpu;

	static_branch_disable(&arch_scale_freq_key);

	/*
	 * Set arch_freq_scale to a default value on all CPUs.
	 * This negates the effect of scaling.
	 */
	for_each_possible_cpu(cpu)
		per_cpu(arch_freq_scale, cpu) = SCHED_CAPACITY_SCALE;
}

static DECLARE_WORK(disable_freq_invariance_work,
		    disable_freq_invariance_workfn);
DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale);
static void scale_freq_tick(u64 acnt, u64 mcnt)
{
	u64 freq_scale;

	if (!arch_scale_freq_invariant())
		return;

	if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
		goto error;

	if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
		goto error;

	freq_scale = div64_u64(acnt, mcnt);
	if (freq_scale > SCHED_CAPACITY_SCALE)
		freq_scale = SCHED_CAPACITY_SCALE;

	this_cpu_write(arch_freq_scale, freq_scale);
	return;

error:
	pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
	schedule_work(&disable_freq_invariance_work);
}
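/*
 * Worked example, continuing the made-up ratio above: with
 * arch_max_freq_ratio = 1433 and a tick where acnt == mcnt (by the BusyMHz
 * formula the CPU ran at exactly freq_base), we get
 *
 *	freq_scale = (acnt << 20) / (mcnt * 1433) = 1048576 / 1433 ~ 731
 *
 * i.e. roughly 1024 * freq_base / freq_max; only a CPU running flat out at
 * the assumed freq_max reaches 1024 before the clipping above.
 */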
#else
static inline void bp_init_freq_invariance(void) { }
static inline void scale_freq_tick(u64 acnt, u64 mcnt) { }
#endif /* CONFIG_X86_64 && CONFIG_SMP */
void arch_scale_freq_tick(void)
{
	struct aperfmperf *s = this_cpu_ptr(&cpu_samples);
	u64 acnt, mcnt, aperf, mperf;

	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		return;

	rdmsrl(MSR_IA32_APERF, aperf);
	rdmsrl(MSR_IA32_MPERF, mperf);
	acnt = aperf - s->aperf;
	mcnt = mperf - s->mperf;

	s->aperf = aperf;
	s->mperf = mperf;

	raw_write_seqcount_begin(&s->seq);
	s->last_update = jiffies;
	s->acnt = acnt;
	s->mcnt = mcnt;
	raw_write_seqcount_end(&s->seq);

	scale_freq_tick(acnt, mcnt);
}
/*
 * Discard samples older than the defined maximum sample age of 20ms. There
 * is no point in sending IPIs in such a case. If the scheduler tick was
 * not running then the CPU is either idle or isolated.
 */
#define MAX_SAMPLE_AGE	((unsigned long)HZ / 50)
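/*
 * For example, HZ = 1000 yields MAX_SAMPLE_AGE = 20 jiffies and HZ = 250
 * yields 5 jiffies; either way the cutoff is the same 20ms.
 */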
unsigned int arch_freq_get_on_cpu(int cpu)
{
	struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
	unsigned int seq, freq;
	unsigned long last;
	u64 acnt, mcnt;

	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		goto fallback;

	do {
		seq = raw_read_seqcount_begin(&s->seq);
		last = s->last_update;
		acnt = s->acnt;
		mcnt = s->mcnt;
	} while (read_seqcount_retry(&s->seq, seq));

	/*
	 * Bail on invalid count and when the last update was too long ago,
	 * which covers idle and NOHZ full CPUs.
	 */
	if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE)
		goto fallback;

	return div64_u64((cpu_khz * acnt), mcnt);

fallback:
	freq = cpufreq_quick_get(cpu);
	return freq ? freq : cpu_khz;
}
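/*
 * Worked example with made-up counts: cpu_khz = 2000000 (a 2 GHz base) and
 * a sample with acnt = 3000, mcnt = 2000 yield
 *
 *	2000000 * 3000 / 2000 = 3000000 kHz
 *
 * i.e. the CPU averaged 3 GHz while it was busy during the sample window.
 */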
static int __init bp_init_aperfmperf(void)
{
	if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		return 0;

	init_counter_refs();
	bp_init_freq_invariance();
	return 0;
}
early_initcall(bp_init_aperfmperf);
void ap_init_aperfmperf(void)
{
	if (cpu_feature_enabled(X86_FEATURE_APERFMPERF))
		init_counter_refs();
}