1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Kernel-based Virtual Machine driver for Linux
4  *
5  * derived from drivers/kvm/kvm_main.c
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  * Copyright (C) 2008 Qumranet, Inc.
9  * Copyright IBM Corporation, 2008
10  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
11  *
12  * Authors:
13  *   Avi Kivity   <avi@qumranet.com>
14  *   Yaniv Kamay  <yaniv@qumranet.com>
15  *   Amit Shah    <amit.shah@qumranet.com>
16  *   Ben-Ami Yassour <benami@il.ibm.com>
17  */
18
19 #include <linux/kvm_host.h>
20 #include "irq.h"
21 #include "ioapic.h"
22 #include "mmu.h"
23 #include "i8254.h"
24 #include "tss.h"
25 #include "kvm_cache_regs.h"
26 #include "kvm_emulate.h"
27 #include "x86.h"
28 #include "cpuid.h"
29 #include "pmu.h"
30 #include "hyperv.h"
31 #include "lapic.h"
32
33 #include <linux/clocksource.h>
34 #include <linux/interrupt.h>
35 #include <linux/kvm.h>
36 #include <linux/fs.h>
37 #include <linux/vmalloc.h>
38 #include <linux/export.h>
39 #include <linux/moduleparam.h>
40 #include <linux/mman.h>
41 #include <linux/highmem.h>
42 #include <linux/iommu.h>
43 #include <linux/intel-iommu.h>
44 #include <linux/cpufreq.h>
45 #include <linux/user-return-notifier.h>
46 #include <linux/srcu.h>
47 #include <linux/slab.h>
48 #include <linux/perf_event.h>
49 #include <linux/uaccess.h>
50 #include <linux/hash.h>
51 #include <linux/pci.h>
52 #include <linux/timekeeper_internal.h>
53 #include <linux/pvclock_gtod.h>
54 #include <linux/kvm_irqfd.h>
55 #include <linux/irqbypass.h>
56 #include <linux/sched/stat.h>
57 #include <linux/sched/isolation.h>
58 #include <linux/mem_encrypt.h>
59 #include <linux/entry-kvm.h>
60
61 #include <trace/events/kvm.h>
62
63 #include <asm/debugreg.h>
64 #include <asm/msr.h>
65 #include <asm/desc.h>
66 #include <asm/mce.h>
67 #include <linux/kernel_stat.h>
68 #include <asm/fpu/internal.h> /* Ugh! */
69 #include <asm/pvclock.h>
70 #include <asm/div64.h>
71 #include <asm/irq_remapping.h>
72 #include <asm/mshyperv.h>
73 #include <asm/hypervisor.h>
74 #include <asm/tlbflush.h>
75 #include <asm/intel_pt.h>
76 #include <asm/emulate_prefix.h>
77 #include <clocksource/hyperv_timer.h>
78
79 #define CREATE_TRACE_POINTS
80 #include "trace.h"
81
82 #define MAX_IO_MSRS 256
83 #define KVM_MAX_MCE_BANKS 32
84 u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
85 EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
86
87 #define emul_to_vcpu(ctxt) \
88         ((struct kvm_vcpu *)(ctxt)->vcpu)
89
90 /* EFER defaults:
91  * - enable syscall by default because it's emulated by KVM
92  * - enable LME and LMA by default on 64-bit KVM
93  */
94 #ifdef CONFIG_X86_64
95 static
96 u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
97 #else
98 static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
99 #endif
100
101 static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
102
103 #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
104                                     KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
105
106 static void update_cr8_intercept(struct kvm_vcpu *vcpu);
107 static void process_nmi(struct kvm_vcpu *vcpu);
108 static void enter_smm(struct kvm_vcpu *vcpu);
109 static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
110 static void store_regs(struct kvm_vcpu *vcpu);
111 static int sync_regs(struct kvm_vcpu *vcpu);
112
113 struct kvm_x86_ops kvm_x86_ops __read_mostly;
114 EXPORT_SYMBOL_GPL(kvm_x86_ops);
115
116 static bool __read_mostly ignore_msrs = 0;
117 module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
118
119 static bool __read_mostly report_ignored_msrs = true;
120 module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
121
122 unsigned int min_timer_period_us = 200;
123 module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
124
125 static bool __read_mostly kvmclock_periodic_sync = true;
126 module_param(kvmclock_periodic_sync, bool, S_IRUGO);
127
128 bool __read_mostly kvm_has_tsc_control;
129 EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
130 u32  __read_mostly kvm_max_guest_tsc_khz;
131 EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
132 u8   __read_mostly kvm_tsc_scaling_ratio_frac_bits;
133 EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
134 u64  __read_mostly kvm_max_tsc_scaling_ratio;
135 EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
136 u64 __read_mostly kvm_default_tsc_scaling_ratio;
137 EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
138
139 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
140 static u32 __read_mostly tsc_tolerance_ppm = 250;
141 module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
142
143 /*
144  * lapic timer advance (tscdeadline mode only) in nanoseconds.  '-1' enables
145  * adaptive tuning starting from a default advancement of 1000ns.  '0' disables
146  * advancement entirely.  Any other value is used as-is and disables adaptive
147  * tuning, i.e. allows privileged userspace to set an exact advancement time.
148  */
149 static int __read_mostly lapic_timer_advance_ns = -1;
150 module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
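/*
 * Illustrative usage note (not from the original source): since x86.c is
 * built into kvm.ko, the parameter above can typically be set at module
 * load time, e.g. "kvm.lapic_timer_advance_ns=0" on the kernel command
 * line to disable advancement, or changed at runtime via
 * /sys/module/kvm/parameters/lapic_timer_advance_ns.  The exact sysfs path
 * is an assumption based on standard module_param() behaviour.
 */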
151
152 static bool __read_mostly vector_hashing = true;
153 module_param(vector_hashing, bool, S_IRUGO);
154
155 bool __read_mostly enable_vmware_backdoor = false;
156 module_param(enable_vmware_backdoor, bool, S_IRUGO);
157 EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
158
159 static bool __read_mostly force_emulation_prefix = false;
160 module_param(force_emulation_prefix, bool, S_IRUGO);
161
162 int __read_mostly pi_inject_timer = -1;
163 module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
164
165 /*
166  * Restoring the host value for MSRs that are only consumed when running in
167  * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
168  * returns to userspace, i.e. the kernel can run with the guest's value.
169  */
170 #define KVM_MAX_NR_USER_RETURN_MSRS 16
171
172 struct kvm_user_return_msrs_global {
173         int nr;
174         u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS];
175 };
176
177 struct kvm_user_return_msrs {
178         struct user_return_notifier urn;
179         bool registered;
180         struct kvm_user_return_msr_values {
181                 u64 host;
182                 u64 curr;
183         } values[KVM_MAX_NR_USER_RETURN_MSRS];
184 };
185
186 static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global;
187 static struct kvm_user_return_msrs __percpu *user_return_msrs;
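/*
 * Illustrative sketch (not part of the original file) of how vendor code is
 * expected to use the user-return MSR machinery: reserve a slot for each
 * deferred MSR during hardware setup, then load the guest value before
 * entering the guest, e.g.:
 *
 *        kvm_define_user_return_msr(0, MSR_TSC_AUX);    // once, at setup
 *        ...
 *        kvm_set_user_return_msr(0, guest_val, -1ull);  // before VM-entry
 *
 * The host value is then restored lazily by kvm_on_user_return() when the
 * CPU returns to userspace.  The slot index and MSR_TSC_AUX are examples
 * only, not taken from this file.
 */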
188
189 #define KVM_SUPPORTED_XCR0     (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
190                                 | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
191                                 | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
192                                 | XFEATURE_MASK_PKRU)
193
194 u64 __read_mostly host_efer;
195 EXPORT_SYMBOL_GPL(host_efer);
196
197 bool __read_mostly allow_smaller_maxphyaddr = 0;
198 EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
199
200 static u64 __read_mostly host_xss;
201 u64 __read_mostly supported_xss;
202 EXPORT_SYMBOL_GPL(supported_xss);
203
204 struct kvm_stats_debugfs_item debugfs_entries[] = {
205         VCPU_STAT("pf_fixed", pf_fixed),
206         VCPU_STAT("pf_guest", pf_guest),
207         VCPU_STAT("tlb_flush", tlb_flush),
208         VCPU_STAT("invlpg", invlpg),
209         VCPU_STAT("exits", exits),
210         VCPU_STAT("io_exits", io_exits),
211         VCPU_STAT("mmio_exits", mmio_exits),
212         VCPU_STAT("signal_exits", signal_exits),
213         VCPU_STAT("irq_window", irq_window_exits),
214         VCPU_STAT("nmi_window", nmi_window_exits),
215         VCPU_STAT("halt_exits", halt_exits),
216         VCPU_STAT("halt_successful_poll", halt_successful_poll),
217         VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
218         VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
219         VCPU_STAT("halt_wakeup", halt_wakeup),
220         VCPU_STAT("hypercalls", hypercalls),
221         VCPU_STAT("request_irq", request_irq_exits),
222         VCPU_STAT("irq_exits", irq_exits),
223         VCPU_STAT("host_state_reload", host_state_reload),
224         VCPU_STAT("fpu_reload", fpu_reload),
225         VCPU_STAT("insn_emulation", insn_emulation),
226         VCPU_STAT("insn_emulation_fail", insn_emulation_fail),
227         VCPU_STAT("irq_injections", irq_injections),
228         VCPU_STAT("nmi_injections", nmi_injections),
229         VCPU_STAT("req_event", req_event),
230         VCPU_STAT("l1d_flush", l1d_flush),
231         VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
232         VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
233         VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
234         VM_STAT("mmu_pte_write", mmu_pte_write),
235         VM_STAT("mmu_pte_updated", mmu_pte_updated),
236         VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
237         VM_STAT("mmu_flooded", mmu_flooded),
238         VM_STAT("mmu_recycled", mmu_recycled),
239         VM_STAT("mmu_cache_miss", mmu_cache_miss),
240         VM_STAT("mmu_unsync", mmu_unsync),
241         VM_STAT("remote_tlb_flush", remote_tlb_flush),
242         VM_STAT("largepages", lpages, .mode = 0444),
243         VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444),
244         VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions),
245         { NULL }
246 };
247
248 u64 __read_mostly host_xcr0;
249 u64 __read_mostly supported_xcr0;
250 EXPORT_SYMBOL_GPL(supported_xcr0);
251
252 static struct kmem_cache *x86_fpu_cache;
253
254 static struct kmem_cache *x86_emulator_cache;
255
256 /*
257  * Called when the previous get/set of an MSR reached an invalid MSR.
258  * Returns true if the failed MSR access should be ignored/silenced.
259  */
260 static bool kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr,
261                                   u64 data, bool write)
262 {
263         const char *op = write ? "wrmsr" : "rdmsr";
264
265         if (ignore_msrs) {
266                 if (report_ignored_msrs)
267                         kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n",
268                                       op, msr, data);
269                 /* Mask the error */
270                 return true;
271         } else {
272                 kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n",
273                                       op, msr, data);
274                 return false;
275         }
276 }
277
278 static struct kmem_cache *kvm_alloc_emulator_cache(void)
279 {
280         unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src);
281         unsigned int size = sizeof(struct x86_emulate_ctxt);
282
283         return kmem_cache_create_usercopy("x86_emulator", size,
284                                           __alignof__(struct x86_emulate_ctxt),
285                                           SLAB_ACCOUNT, useroffset,
286                                           size - useroffset, NULL);
287 }
288
289 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
290
291 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
292 {
293         int i;
294         for (i = 0; i < ASYNC_PF_PER_VCPU; i++)
295                 vcpu->arch.apf.gfns[i] = ~0;
296 }
297
298 static void kvm_on_user_return(struct user_return_notifier *urn)
299 {
300         unsigned slot;
301         struct kvm_user_return_msrs *msrs
302                 = container_of(urn, struct kvm_user_return_msrs, urn);
303         struct kvm_user_return_msr_values *values;
304         unsigned long flags;
305
306         /*
307          * Disabling irqs at this point since the following code could be
308          * interrupted and executed through kvm_arch_hardware_disable()
309          */
310         local_irq_save(flags);
311         if (msrs->registered) {
312                 msrs->registered = false;
313                 user_return_notifier_unregister(urn);
314         }
315         local_irq_restore(flags);
316         for (slot = 0; slot < user_return_msrs_global.nr; ++slot) {
317                 values = &msrs->values[slot];
318                 if (values->host != values->curr) {
319                         wrmsrl(user_return_msrs_global.msrs[slot], values->host);
320                         values->curr = values->host;
321                 }
322         }
323 }
324
325 void kvm_define_user_return_msr(unsigned slot, u32 msr)
326 {
327         BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS);
328         user_return_msrs_global.msrs[slot] = msr;
329         if (slot >= user_return_msrs_global.nr)
330                 user_return_msrs_global.nr = slot + 1;
331 }
332 EXPORT_SYMBOL_GPL(kvm_define_user_return_msr);
333
334 static void kvm_user_return_msr_cpu_online(void)
335 {
336         unsigned int cpu = smp_processor_id();
337         struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
338         u64 value;
339         int i;
340
341         for (i = 0; i < user_return_msrs_global.nr; ++i) {
342                 rdmsrl_safe(user_return_msrs_global.msrs[i], &value);
343                 msrs->values[i].host = value;
344                 msrs->values[i].curr = value;
345         }
346 }
347
348 int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
349 {
350         unsigned int cpu = smp_processor_id();
351         struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
352         int err;
353
354         value = (value & mask) | (msrs->values[slot].host & ~mask);
355         if (value == msrs->values[slot].curr)
356                 return 0;
357         err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value);
358         if (err)
359                 return 1;
360
361         msrs->values[slot].curr = value;
362         if (!msrs->registered) {
363                 msrs->urn.on_user_return = kvm_on_user_return;
364                 user_return_notifier_register(&msrs->urn);
365                 msrs->registered = true;
366         }
367         return 0;
368 }
369 EXPORT_SYMBOL_GPL(kvm_set_user_return_msr);
370
371 static void drop_user_return_notifiers(void)
372 {
373         unsigned int cpu = smp_processor_id();
374         struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
375
376         if (msrs->registered)
377                 kvm_on_user_return(&msrs->urn);
378 }
379
380 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
381 {
382         return vcpu->arch.apic_base;
383 }
384 EXPORT_SYMBOL_GPL(kvm_get_apic_base);
385
386 enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu)
387 {
388         return kvm_apic_mode(kvm_get_apic_base(vcpu));
389 }
390 EXPORT_SYMBOL_GPL(kvm_get_apic_mode);
391
392 int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
393 {
394         enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
395         enum lapic_mode new_mode = kvm_apic_mode(msr_info->data);
396         u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 0x2ff |
397                 (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
398
399         if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
400                 return 1;
401         if (!msr_info->host_initiated) {
402                 if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC)
403                         return 1;
404                 if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC)
405                         return 1;
406         }
407
408         kvm_lapic_set_base(vcpu, msr_info->data);
409         kvm_recalculate_apic_map(vcpu->kvm);
410         return 0;
411 }
412 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
413
414 asmlinkage __visible noinstr void kvm_spurious_fault(void)
415 {
416         /* Fault while not rebooting.  We want the trace. */
417         BUG_ON(!kvm_rebooting);
418 }
419 EXPORT_SYMBOL_GPL(kvm_spurious_fault);
420
421 #define EXCPT_BENIGN            0
422 #define EXCPT_CONTRIBUTORY      1
423 #define EXCPT_PF                2
424
425 static int exception_class(int vector)
426 {
427         switch (vector) {
428         case PF_VECTOR:
429                 return EXCPT_PF;
430         case DE_VECTOR:
431         case TS_VECTOR:
432         case NP_VECTOR:
433         case SS_VECTOR:
434         case GP_VECTOR:
435                 return EXCPT_CONTRIBUTORY;
436         default:
437                 break;
438         }
439         return EXCPT_BENIGN;
440 }
441
442 #define EXCPT_FAULT             0
443 #define EXCPT_TRAP              1
444 #define EXCPT_ABORT             2
445 #define EXCPT_INTERRUPT         3
446
447 static int exception_type(int vector)
448 {
449         unsigned int mask;
450
451         if (WARN_ON(vector > 31 || vector == NMI_VECTOR))
452                 return EXCPT_INTERRUPT;
453
454         mask = 1 << vector;
455
456         /* #DB is a trap, as instruction watchpoints are handled elsewhere */
457         if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
458                 return EXCPT_TRAP;
459
460         if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
461                 return EXCPT_ABORT;
462
463         /* Reserved exceptions will result in fault */
464         return EXCPT_FAULT;
465 }
466
467 void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
468 {
469         unsigned nr = vcpu->arch.exception.nr;
470         bool has_payload = vcpu->arch.exception.has_payload;
471         unsigned long payload = vcpu->arch.exception.payload;
472
473         if (!has_payload)
474                 return;
475
476         switch (nr) {
477         case DB_VECTOR:
478                 /*
479                  * "Certain debug exceptions may clear bits 0-3.  The
480                  * remaining contents of the DR6 register are never
481                  * cleared by the processor".
482                  */
483                 vcpu->arch.dr6 &= ~DR_TRAP_BITS;
484                 /*
485                  * DR6.RTM is set by all #DB exceptions that don't clear it.
486                  */
487                 vcpu->arch.dr6 |= DR6_RTM;
488                 vcpu->arch.dr6 |= payload;
489                 /*
490                  * Bit 16 should be set in the payload whenever the #DB
491                  * exception should clear DR6.RTM. This makes the payload
492                  * compatible with the pending debug exceptions under VMX.
493                  * Though not currently documented in the SDM, this also
494                  * makes the payload compatible with the exit qualification
495                  * for #DB exceptions under VMX.
496                  */
497                 vcpu->arch.dr6 ^= payload & DR6_RTM;
498
499                 /*
500                  * The #DB payload is defined as compatible with the 'pending
501                  * debug exceptions' field under VMX, not DR6. While bit 12 is
502                  * defined in the 'pending debug exceptions' field (enabled
503                  * breakpoint), it is reserved and must be zero in DR6.
504                  */
505                 vcpu->arch.dr6 &= ~BIT(12);
506                 break;
507         case PF_VECTOR:
508                 vcpu->arch.cr2 = payload;
509                 break;
510         }
511
512         vcpu->arch.exception.has_payload = false;
513         vcpu->arch.exception.payload = 0;
514 }
515 EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
516
517 static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
518                 unsigned nr, bool has_error, u32 error_code,
519                 bool has_payload, unsigned long payload, bool reinject)
520 {
521         u32 prev_nr;
522         int class1, class2;
523
524         kvm_make_request(KVM_REQ_EVENT, vcpu);
525
526         if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
527         queue:
528                 if (has_error && !is_protmode(vcpu))
529                         has_error = false;
530                 if (reinject) {
531                         /*
532                          * On vmentry, vcpu->arch.exception.pending is only
533                          * true if an event injection was blocked by
534                          * nested_run_pending.  In that case, however,
535                          * vcpu_enter_guest requests an immediate exit,
536                          * and the guest shouldn't proceed far enough to
537                          * need reinjection.
538                          */
539                         WARN_ON_ONCE(vcpu->arch.exception.pending);
540                         vcpu->arch.exception.injected = true;
541                         if (WARN_ON_ONCE(has_payload)) {
542                                 /*
543                                  * A reinjected event has already
544                                  * delivered its payload.
545                                  */
546                                 has_payload = false;
547                                 payload = 0;
548                         }
549                 } else {
550                         vcpu->arch.exception.pending = true;
551                         vcpu->arch.exception.injected = false;
552                 }
553                 vcpu->arch.exception.has_error_code = has_error;
554                 vcpu->arch.exception.nr = nr;
555                 vcpu->arch.exception.error_code = error_code;
556                 vcpu->arch.exception.has_payload = has_payload;
557                 vcpu->arch.exception.payload = payload;
558                 if (!is_guest_mode(vcpu))
559                         kvm_deliver_exception_payload(vcpu);
560                 return;
561         }
562
563         /* to check exception */
564         prev_nr = vcpu->arch.exception.nr;
565         if (prev_nr == DF_VECTOR) {
566                 /* triple fault -> shutdown */
567                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
568                 return;
569         }
570         class1 = exception_class(prev_nr);
571         class2 = exception_class(nr);
572         if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
573                 || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
574                 /*
575                  * Generate double fault per SDM Table 5-5.  Set
576                  * exception.pending = true so that the double fault
577                  * can trigger a nested vmexit.
578                  */
579                 vcpu->arch.exception.pending = true;
580                 vcpu->arch.exception.injected = false;
581                 vcpu->arch.exception.has_error_code = true;
582                 vcpu->arch.exception.nr = DF_VECTOR;
583                 vcpu->arch.exception.error_code = 0;
584                 vcpu->arch.exception.has_payload = false;
585                 vcpu->arch.exception.payload = 0;
586         } else
587                 /* replace the previous exception with a new one in the hope
588                    that instruction re-execution will regenerate the lost
589                    exception */
590                 goto queue;
591 }
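/*
 * Worked example of the merging logic above (illustrative, not from the
 * original source): if a contributory exception such as #NP is already
 * pending and another contributory exception such as #GP is raised, the
 * two are merged into a #DF per SDM Table 5-5.  A benign pending exception
 * (e.g. #DB) followed by #GP instead takes the "queue" path and the new
 * exception simply replaces the old one.
 */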
592
593 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
594 {
595         kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false);
596 }
597 EXPORT_SYMBOL_GPL(kvm_queue_exception);
598
599 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
600 {
601         kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true);
602 }
603 EXPORT_SYMBOL_GPL(kvm_requeue_exception);
604
605 void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
606                            unsigned long payload)
607 {
608         kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false);
609 }
610 EXPORT_SYMBOL_GPL(kvm_queue_exception_p);
611
612 static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
613                                     u32 error_code, unsigned long payload)
614 {
615         kvm_multiple_exception(vcpu, nr, true, error_code,
616                                true, payload, false);
617 }
618
619 int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
620 {
621         if (err)
622                 kvm_inject_gp(vcpu, 0);
623         else
624                 return kvm_skip_emulated_instruction(vcpu);
625
626         return 1;
627 }
628 EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
629
630 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
631 {
632         ++vcpu->stat.pf_guest;
633         vcpu->arch.exception.nested_apf =
634                 is_guest_mode(vcpu) && fault->async_page_fault;
635         if (vcpu->arch.exception.nested_apf) {
636                 vcpu->arch.apf.nested_apf_token = fault->address;
637                 kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
638         } else {
639                 kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
640                                         fault->address);
641         }
642 }
643 EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
644
645 bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
646                                     struct x86_exception *fault)
647 {
648         struct kvm_mmu *fault_mmu;
649         WARN_ON_ONCE(fault->vector != PF_VECTOR);
650
651         fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu :
652                                                vcpu->arch.walk_mmu;
653
654         /*
655          * Invalidate the TLB entry for the faulting address, if it exists;
656          * otherwise the access would fault indefinitely (this also emulates hardware).
657          */
658         if ((fault->error_code & PFERR_PRESENT_MASK) &&
659             !(fault->error_code & PFERR_RSVD_MASK))
660                 kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
661                                        fault_mmu->root_hpa);
662
663         fault_mmu->inject_page_fault(vcpu, fault);
664         return fault->nested_page_fault;
665 }
666 EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);
667
668 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
669 {
670         atomic_inc(&vcpu->arch.nmi_queued);
671         kvm_make_request(KVM_REQ_NMI, vcpu);
672 }
673 EXPORT_SYMBOL_GPL(kvm_inject_nmi);
674
675 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
676 {
677         kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false);
678 }
679 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
680
681 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
682 {
683         kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true);
684 }
685 EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
686
687 /*
688  * Checks whether cpl <= required_cpl; if so, returns true.  Otherwise queues
689  * a #GP and returns false.
690  */
691 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
692 {
693         if (kvm_x86_ops.get_cpl(vcpu) <= required_cpl)
694                 return true;
695         kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
696         return false;
697 }
698 EXPORT_SYMBOL_GPL(kvm_require_cpl);
699
700 bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
701 {
702         if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE))
703                 return true;
704
705         kvm_queue_exception(vcpu, UD_VECTOR);
706         return false;
707 }
708 EXPORT_SYMBOL_GPL(kvm_require_dr);
709
710 /*
711  * This function is used to read from the physical memory of the currently
712  * running guest. The difference from kvm_vcpu_read_guest_page is that this
713  * function can read from guest physical or from the guest's guest physical memory.
714  */
715 int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
716                             gfn_t ngfn, void *data, int offset, int len,
717                             u32 access)
718 {
719         struct x86_exception exception;
720         gfn_t real_gfn;
721         gpa_t ngpa;
722
723         ngpa     = gfn_to_gpa(ngfn);
724         real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception);
725         if (real_gfn == UNMAPPED_GVA)
726                 return -EFAULT;
727
728         real_gfn = gpa_to_gfn(real_gfn);
729
730         return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len);
731 }
732 EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
733
734 static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
735                                void *data, int offset, int len, u32 access)
736 {
737         return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
738                                        data, offset, len, access);
739 }
740
741 static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
742 {
743         return rsvd_bits(cpuid_maxphyaddr(vcpu), 63) | rsvd_bits(5, 8) |
744                rsvd_bits(1, 2);
745 }
746
747 /*
748  * Load the pae pdptrs.  Return 1 if they are all valid, 0 otherwise.
749  */
750 int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
751 {
752         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
753         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
754         int i;
755         int ret;
756         u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
757
758         ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
759                                       offset * sizeof(u64), sizeof(pdpte),
760                                       PFERR_USER_MASK|PFERR_WRITE_MASK);
761         if (ret < 0) {
762                 ret = 0;
763                 goto out;
764         }
765         for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
766                 if ((pdpte[i] & PT_PRESENT_MASK) &&
767                     (pdpte[i] & pdptr_rsvd_bits(vcpu))) {
768                         ret = 0;
769                         goto out;
770                 }
771         }
772         ret = 1;
773
774         memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
775         kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
776
777 out:
778
779         return ret;
780 }
781 EXPORT_SYMBOL_GPL(load_pdptrs);
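/*
 * Worked example of the offset computation in load_pdptrs() above
 * (illustrative, not from the original source): in PAE mode, CR3 bits 31:5
 * hold the 32-byte-aligned address of the PDPT.  With cr3 = 0x12345ec0:
 *
 *        cr3 & (PAGE_SIZE - 1) = 0xec0   byte offset of the PDPT in its page
 *        0xec0 >> 5            = 0x76    index of the 32-byte PDPT block
 *        0x76  << 2            = 0x1d8   index of the first u64 entry (each
 *                                        32-byte block holds four u64 entries)
 *
 * kvm_read_guest_page_mmu() converts the entry index back into a byte
 * offset via "offset * sizeof(u64)", i.e. 0x1d8 * 8 = 0xec0.
 */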
782
783 bool pdptrs_changed(struct kvm_vcpu *vcpu)
784 {
785         u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
786         int offset;
787         gfn_t gfn;
788         int r;
789
790         if (!is_pae_paging(vcpu))
791                 return false;
792
793         if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
794                 return true;
795
796         gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
797         offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1);
798         r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
799                                        PFERR_USER_MASK | PFERR_WRITE_MASK);
800         if (r < 0)
801                 return true;
802
803         return memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
804 }
805 EXPORT_SYMBOL_GPL(pdptrs_changed);
806
807 void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
808 {
809         unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
810
811         if ((cr0 ^ old_cr0) & X86_CR0_PG) {
812                 kvm_clear_async_pf_completion_queue(vcpu);
813                 kvm_async_pf_hash_reset(vcpu);
814         }
815
816         if ((cr0 ^ old_cr0) & update_bits)
817                 kvm_mmu_reset_context(vcpu);
818
819         if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
820             kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
821             !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
822                 kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
823 }
824 EXPORT_SYMBOL_GPL(kvm_post_set_cr0);
825
826 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
827 {
828         unsigned long old_cr0 = kvm_read_cr0(vcpu);
829         unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG;
830
831         cr0 |= X86_CR0_ET;
832
833 #ifdef CONFIG_X86_64
834         if (cr0 & 0xffffffff00000000UL)
835                 return 1;
836 #endif
837
838         cr0 &= ~CR0_RESERVED_BITS;
839
840         if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
841                 return 1;
842
843         if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
844                 return 1;
845
846 #ifdef CONFIG_X86_64
847         if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) &&
848             (cr0 & X86_CR0_PG)) {
849                 int cs_db, cs_l;
850
851                 if (!is_pae(vcpu))
852                         return 1;
853                 kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
854                 if (cs_l)
855                         return 1;
856         }
857 #endif
858         if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) &&
859             is_pae(vcpu) && ((cr0 ^ old_cr0) & pdptr_bits) &&
860             !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)))
861                 return 1;
862
863         if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
864                 return 1;
865
866         kvm_x86_ops.set_cr0(vcpu, cr0);
867
868         kvm_post_set_cr0(vcpu, old_cr0, cr0);
869
870         return 0;
871 }
872 EXPORT_SYMBOL_GPL(kvm_set_cr0);
873
874 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
875 {
876         (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
877 }
878 EXPORT_SYMBOL_GPL(kvm_lmsw);
879
880 void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
881 {
882         if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
883
884                 if (vcpu->arch.xcr0 != host_xcr0)
885                         xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
886
887                 if (vcpu->arch.xsaves_enabled &&
888                     vcpu->arch.ia32_xss != host_xss)
889                         wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
890         }
891
892         if (static_cpu_has(X86_FEATURE_PKU) &&
893             (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
894              (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) &&
895             vcpu->arch.pkru != vcpu->arch.host_pkru)
896                 __write_pkru(vcpu->arch.pkru);
897 }
898 EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
899
900 void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
901 {
902         if (static_cpu_has(X86_FEATURE_PKU) &&
903             (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
904              (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) {
905                 vcpu->arch.pkru = rdpkru();
906                 if (vcpu->arch.pkru != vcpu->arch.host_pkru)
907                         __write_pkru(vcpu->arch.host_pkru);
908         }
909
910         if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
911
912                 if (vcpu->arch.xcr0 != host_xcr0)
913                         xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
914
915                 if (vcpu->arch.xsaves_enabled &&
916                     vcpu->arch.ia32_xss != host_xss)
917                         wrmsrl(MSR_IA32_XSS, host_xss);
918         }
919
920 }
921 EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state);
922
923 static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
924 {
925         u64 xcr0 = xcr;
926         u64 old_xcr0 = vcpu->arch.xcr0;
927         u64 valid_bits;
928
929         /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
930         if (index != XCR_XFEATURE_ENABLED_MASK)
931                 return 1;
932         if (!(xcr0 & XFEATURE_MASK_FP))
933                 return 1;
934         if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE))
935                 return 1;
936
937         /*
938          * Do not allow the guest to set bits that we do not support
939          * saving.  However, xcr0 bit 0 is always set, even if the
940          * emulated CPU does not support XSAVE (see fx_init).
941          */
942         valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
943         if (xcr0 & ~valid_bits)
944                 return 1;
945
946         if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) !=
947             (!(xcr0 & XFEATURE_MASK_BNDCSR)))
948                 return 1;
949
950         if (xcr0 & XFEATURE_MASK_AVX512) {
951                 if (!(xcr0 & XFEATURE_MASK_YMM))
952                         return 1;
953                 if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
954                         return 1;
955         }
956         vcpu->arch.xcr0 = xcr0;
957
958         if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
959                 kvm_update_cpuid_runtime(vcpu);
960         return 0;
961 }
962
963 int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
964 {
965         if (kvm_x86_ops.get_cpl(vcpu) != 0 ||
966             __kvm_set_xcr(vcpu, index, xcr)) {
967                 kvm_inject_gp(vcpu, 0);
968                 return 1;
969         }
970         return 0;
971 }
972 EXPORT_SYMBOL_GPL(kvm_set_xcr);
973
974 bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
975 {
976         if (cr4 & cr4_reserved_bits)
977                 return false;
978
979         if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
980                 return false;
981
982         return kvm_x86_ops.is_valid_cr4(vcpu, cr4);
983 }
984 EXPORT_SYMBOL_GPL(kvm_is_valid_cr4);
985
986 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
987 {
988         unsigned long old_cr4 = kvm_read_cr4(vcpu);
989         unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
990                                    X86_CR4_SMEP;
991         unsigned long mmu_role_bits = pdptr_bits | X86_CR4_SMAP | X86_CR4_PKE;
992
993         if (!kvm_is_valid_cr4(vcpu, cr4))
994                 return 1;
995
996         if (is_long_mode(vcpu)) {
997                 if (!(cr4 & X86_CR4_PAE))
998                         return 1;
999                 if ((cr4 ^ old_cr4) & X86_CR4_LA57)
1000                         return 1;
1001         } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
1002                    && ((cr4 ^ old_cr4) & pdptr_bits)
1003                    && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
1004                                    kvm_read_cr3(vcpu)))
1005                 return 1;
1006
1007         if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
1008                 if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
1009                         return 1;
1010
1011                 /* PCID cannot be enabled when CR3[11:0] != 000H or EFER.LMA = 0 */
1012                 if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
1013                         return 1;
1014         }
1015
1016         kvm_x86_ops.set_cr4(vcpu, cr4);
1017
1018         if (((cr4 ^ old_cr4) & mmu_role_bits) ||
1019             (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
1020                 kvm_mmu_reset_context(vcpu);
1021
1022         return 0;
1023 }
1024 EXPORT_SYMBOL_GPL(kvm_set_cr4);
1025
1026 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1027 {
1028         bool skip_tlb_flush = false;
1029 #ifdef CONFIG_X86_64
1030         bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
1031
1032         if (pcid_enabled) {
1033                 skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
1034                 cr3 &= ~X86_CR3_PCID_NOFLUSH;
1035         }
1036 #endif
1037
1038         if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
1039                 if (!skip_tlb_flush) {
1040                         kvm_mmu_sync_roots(vcpu);
1041                         kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
1042                 }
1043                 return 0;
1044         }
1045
1046         if (is_long_mode(vcpu) &&
1047             (cr3 & vcpu->arch.cr3_lm_rsvd_bits))
1048                 return 1;
1049         else if (is_pae_paging(vcpu) &&
1050                  !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
1051                 return 1;
1052
1053         kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush);
1054         vcpu->arch.cr3 = cr3;
1055         kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
1056
1057         return 0;
1058 }
1059 EXPORT_SYMBOL_GPL(kvm_set_cr3);
1060
1061 int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
1062 {
1063         if (cr8 & CR8_RESERVED_BITS)
1064                 return 1;
1065         if (lapic_in_kernel(vcpu))
1066                 kvm_lapic_set_tpr(vcpu, cr8);
1067         else
1068                 vcpu->arch.cr8 = cr8;
1069         return 0;
1070 }
1071 EXPORT_SYMBOL_GPL(kvm_set_cr8);
1072
1073 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
1074 {
1075         if (lapic_in_kernel(vcpu))
1076                 return kvm_lapic_get_cr8(vcpu);
1077         else
1078                 return vcpu->arch.cr8;
1079 }
1080 EXPORT_SYMBOL_GPL(kvm_get_cr8);
1081
1082 static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
1083 {
1084         int i;
1085
1086         if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
1087                 for (i = 0; i < KVM_NR_DB_REGS; i++)
1088                         vcpu->arch.eff_db[i] = vcpu->arch.db[i];
1089                 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
1090         }
1091 }
1092
1093 void kvm_update_dr7(struct kvm_vcpu *vcpu)
1094 {
1095         unsigned long dr7;
1096
1097         if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1098                 dr7 = vcpu->arch.guest_debug_dr7;
1099         else
1100                 dr7 = vcpu->arch.dr7;
1101         kvm_x86_ops.set_dr7(vcpu, dr7);
1102         vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
1103         if (dr7 & DR7_BP_EN_MASK)
1104                 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
1105 }
1106 EXPORT_SYMBOL_GPL(kvm_update_dr7);
1107
1108 static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
1109 {
1110         u64 fixed = DR6_FIXED_1;
1111
1112         if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
1113                 fixed |= DR6_RTM;
1114         return fixed;
1115 }
1116
1117 static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
1118 {
1119         size_t size = ARRAY_SIZE(vcpu->arch.db);
1120
1121         switch (dr) {
1122         case 0 ... 3:
1123                 vcpu->arch.db[array_index_nospec(dr, size)] = val;
1124                 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
1125                         vcpu->arch.eff_db[dr] = val;
1126                 break;
1127         case 4:
1128         case 6:
1129                 if (!kvm_dr6_valid(val))
1130                         return -1; /* #GP */
1131                 vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
1132                 break;
1133         case 5:
1134         default: /* 7 */
1135                 if (!kvm_dr7_valid(val))
1136                         return -1; /* #GP */
1137                 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
1138                 kvm_update_dr7(vcpu);
1139                 break;
1140         }
1141
1142         return 0;
1143 }
1144
1145 int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
1146 {
1147         if (__kvm_set_dr(vcpu, dr, val)) {
1148                 kvm_inject_gp(vcpu, 0);
1149                 return 1;
1150         }
1151         return 0;
1152 }
1153 EXPORT_SYMBOL_GPL(kvm_set_dr);
1154
1155 int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
1156 {
1157         size_t size = ARRAY_SIZE(vcpu->arch.db);
1158
1159         switch (dr) {
1160         case 0 ... 3:
1161                 *val = vcpu->arch.db[array_index_nospec(dr, size)];
1162                 break;
1163         case 4:
1164         case 6:
1165                 *val = vcpu->arch.dr6;
1166                 break;
1167         case 5:
1168         default: /* 7 */
1169                 *val = vcpu->arch.dr7;
1170                 break;
1171         }
1172         return 0;
1173 }
1174 EXPORT_SYMBOL_GPL(kvm_get_dr);
1175
1176 bool kvm_rdpmc(struct kvm_vcpu *vcpu)
1177 {
1178         u32 ecx = kvm_rcx_read(vcpu);
1179         u64 data;
1180         int err;
1181
1182         err = kvm_pmu_rdpmc(vcpu, ecx, &data);
1183         if (err)
1184                 return err;
1185         kvm_rax_write(vcpu, (u32)data);
1186         kvm_rdx_write(vcpu, data >> 32);
1187         return err;
1188 }
1189 EXPORT_SYMBOL_GPL(kvm_rdpmc);
1190
1191 /*
1192  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
1193  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
1194  *
1195  * The three MSR lists (msrs_to_save, emulated_msrs, msr_based_features)
1196  * extract the supported MSRs from the related const lists.
1197  * msrs_to_save is selected from the msrs_to_save_all to reflect the
1198  * capabilities of the host cpu. This capabilities test skips MSRs that are
1199  * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs
1200  * may depend on host virtualization features rather than host cpu features.
1201  */
1202
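/*
 * Simplified sketch of how msrs_to_save is derived from msrs_to_save_all
 * below (illustrative only; the real filtering in this file also applies
 * per-MSR CPUID/feature checks before probing each MSR):
 *
 *        for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) {
 *                if (rdmsrl_safe(msrs_to_save_all[i], &dummy))
 *                        continue;
 *                msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i];
 *        }
 */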
1203 static const u32 msrs_to_save_all[] = {
1204         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
1205         MSR_STAR,
1206 #ifdef CONFIG_X86_64
1207         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
1208 #endif
1209         MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
1210         MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
1211         MSR_IA32_SPEC_CTRL,
1212         MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
1213         MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
1214         MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
1215         MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
1216         MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
1217         MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
1218         MSR_IA32_UMWAIT_CONTROL,
1219
1220         MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
1221         MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3,
1222         MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
1223         MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
1224         MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
1225         MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
1226         MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
1227         MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
1228         MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9,
1229         MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11,
1230         MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13,
1231         MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15,
1232         MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17,
1233         MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
1234         MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
1235         MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
1236         MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
1237         MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9,
1238         MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11,
1239         MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
1240         MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
1241         MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
1242 };
1243
1244 static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
1245 static unsigned num_msrs_to_save;
1246
1247 static const u32 emulated_msrs_all[] = {
1248         MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
1249         MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
1250         HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
1251         HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
1252         HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
1253         HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
1254         HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
1255         HV_X64_MSR_RESET,
1256         HV_X64_MSR_VP_INDEX,
1257         HV_X64_MSR_VP_RUNTIME,
1258         HV_X64_MSR_SCONTROL,
1259         HV_X64_MSR_STIMER0_CONFIG,
1260         HV_X64_MSR_VP_ASSIST_PAGE,
1261         HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
1262         HV_X64_MSR_TSC_EMULATION_STATUS,
1263         HV_X64_MSR_SYNDBG_OPTIONS,
1264         HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
1265         HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
1266         HV_X64_MSR_SYNDBG_PENDING_BUFFER,
1267
1268         MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
1269         MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
1270
1271         MSR_IA32_TSC_ADJUST,
1272         MSR_IA32_TSCDEADLINE,
1273         MSR_IA32_ARCH_CAPABILITIES,
1274         MSR_IA32_PERF_CAPABILITIES,
1275         MSR_IA32_MISC_ENABLE,
1276         MSR_IA32_MCG_STATUS,
1277         MSR_IA32_MCG_CTL,
1278         MSR_IA32_MCG_EXT_CTL,
1279         MSR_IA32_SMBASE,
1280         MSR_SMI_COUNT,
1281         MSR_PLATFORM_INFO,
1282         MSR_MISC_FEATURES_ENABLES,
1283         MSR_AMD64_VIRT_SPEC_CTRL,
1284         MSR_IA32_POWER_CTL,
1285         MSR_IA32_UCODE_REV,
1286
1287         /*
1288          * The following list leaves out MSRs whose values are determined
1289          * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs.
1290          * We always support the "true" VMX control MSRs, even if the host
1291          * processor does not, so I am putting these registers here rather
1292          * than in msrs_to_save_all.
1293          */
1294         MSR_IA32_VMX_BASIC,
1295         MSR_IA32_VMX_TRUE_PINBASED_CTLS,
1296         MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
1297         MSR_IA32_VMX_TRUE_EXIT_CTLS,
1298         MSR_IA32_VMX_TRUE_ENTRY_CTLS,
1299         MSR_IA32_VMX_MISC,
1300         MSR_IA32_VMX_CR0_FIXED0,
1301         MSR_IA32_VMX_CR4_FIXED0,
1302         MSR_IA32_VMX_VMCS_ENUM,
1303         MSR_IA32_VMX_PROCBASED_CTLS2,
1304         MSR_IA32_VMX_EPT_VPID_CAP,
1305         MSR_IA32_VMX_VMFUNC,
1306
1307         MSR_K7_HWCR,
1308         MSR_KVM_POLL_CONTROL,
1309 };
1310
1311 static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
1312 static unsigned num_emulated_msrs;
1313
1314 /*
1315  * List of msr numbers which are used to expose MSR-based features that
1316  * can be used by a hypervisor to validate requested CPU features.
1317  */
1318 static const u32 msr_based_features_all[] = {
1319         MSR_IA32_VMX_BASIC,
1320         MSR_IA32_VMX_TRUE_PINBASED_CTLS,
1321         MSR_IA32_VMX_PINBASED_CTLS,
1322         MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
1323         MSR_IA32_VMX_PROCBASED_CTLS,
1324         MSR_IA32_VMX_TRUE_EXIT_CTLS,
1325         MSR_IA32_VMX_EXIT_CTLS,
1326         MSR_IA32_VMX_TRUE_ENTRY_CTLS,
1327         MSR_IA32_VMX_ENTRY_CTLS,
1328         MSR_IA32_VMX_MISC,
1329         MSR_IA32_VMX_CR0_FIXED0,
1330         MSR_IA32_VMX_CR0_FIXED1,
1331         MSR_IA32_VMX_CR4_FIXED0,
1332         MSR_IA32_VMX_CR4_FIXED1,
1333         MSR_IA32_VMX_VMCS_ENUM,
1334         MSR_IA32_VMX_PROCBASED_CTLS2,
1335         MSR_IA32_VMX_EPT_VPID_CAP,
1336         MSR_IA32_VMX_VMFUNC,
1337
1338         MSR_F10H_DECFG,
1339         MSR_IA32_UCODE_REV,
1340         MSR_IA32_ARCH_CAPABILITIES,
1341         MSR_IA32_PERF_CAPABILITIES,
1342 };
1343
1344 static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
1345 static unsigned int num_msr_based_features;
1346
1347 static u64 kvm_get_arch_capabilities(void)
1348 {
1349         u64 data = 0;
1350
1351         if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
1352                 rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);
1353
1354         /*
1355          * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
1356          * the nested hypervisor runs with NX huge pages.  If it is not,
1357          * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other
1358          * L1 guests, so it need not worry about its own (L2) guests.
1359          */
1360         data |= ARCH_CAP_PSCHANGE_MC_NO;
1361
1362         /*
1363          * If we're doing cache flushes (either "always" or "cond")
1364          * we will do one whenever the guest does a vmlaunch/vmresume.
1365          * If an outer hypervisor is doing the cache flush for us
1366          * (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that
1367          * capability to the guest too, and if EPT is disabled we're not
1368          * vulnerable.  Overall, only VMENTER_L1D_FLUSH_NEVER will
1369          * require a nested hypervisor to do a flush of its own.
1370          */
1371         if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
1372                 data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
1373
1374         if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
1375                 data |= ARCH_CAP_RDCL_NO;
1376         if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
1377                 data |= ARCH_CAP_SSB_NO;
1378         if (!boot_cpu_has_bug(X86_BUG_MDS))
1379                 data |= ARCH_CAP_MDS_NO;
1380
1381         /*
1382          * On TAA affected systems:
1383          *      - nothing to do if TSX is disabled on the host.
1384          *      - we emulate TSX_CTRL if present on the host.
1385          *        This lets the guest use VERW to clear CPU buffers.
1386          */
1387         if (!boot_cpu_has(X86_FEATURE_RTM))
1388                 data &= ~(ARCH_CAP_TAA_NO | ARCH_CAP_TSX_CTRL_MSR);
1389         else if (!boot_cpu_has_bug(X86_BUG_TAA))
1390                 data |= ARCH_CAP_TAA_NO;
1391
1392         return data;
1393 }
1394
1395 static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
1396 {
1397         switch (msr->index) {
1398         case MSR_IA32_ARCH_CAPABILITIES:
1399                 msr->data = kvm_get_arch_capabilities();
1400                 break;
1401         case MSR_IA32_UCODE_REV:
1402                 rdmsrl_safe(msr->index, &msr->data);
1403                 break;
1404         default:
1405                 return kvm_x86_ops.get_msr_feature(msr);
1406         }
1407         return 0;
1408 }
1409
1410 static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1411 {
1412         struct kvm_msr_entry msr;
1413         int r;
1414
1415         msr.index = index;
1416         r = kvm_get_msr_feature(&msr);
1417
1418         if (r == KVM_MSR_RET_INVALID) {
1419                 /* Unconditionally clear the output for simplicity */
1420                 *data = 0;
1421                 if (kvm_msr_ignored_check(vcpu, index, 0, false))
1422                         r = 0;
1423         }
1424
1425         if (r)
1426                 return r;
1427
1428         *data = msr.data;
1429
1430         return 0;
1431 }
1432
1433 static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
1434 {
1435         if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT))
1436                 return false;
1437
1438         if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
1439                 return false;
1440
1441         if (efer & (EFER_LME | EFER_LMA) &&
1442             !guest_cpuid_has(vcpu, X86_FEATURE_LM))
1443                 return false;
1444
1445         if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX))
1446                 return false;
1447
1448         return true;
1449
1450 }
1451 bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
1452 {
1453         if (efer & efer_reserved_bits)
1454                 return false;
1455
1456         return __kvm_valid_efer(vcpu, efer);
1457 }
1458 EXPORT_SYMBOL_GPL(kvm_valid_efer);
1459
1460 static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1461 {
1462         u64 old_efer = vcpu->arch.efer;
1463         u64 efer = msr_info->data;
1464         int r;
1465
1466         if (efer & efer_reserved_bits)
1467                 return 1;
1468
1469         if (!msr_info->host_initiated) {
1470                 if (!__kvm_valid_efer(vcpu, efer))
1471                         return 1;
1472
1473                 if (is_paging(vcpu) &&
1474                     (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
1475                         return 1;
1476         }
1477
1478         efer &= ~EFER_LMA;
1479         efer |= vcpu->arch.efer & EFER_LMA;
1480
1481         r = kvm_x86_ops.set_efer(vcpu, efer);
1482         if (r) {
1483                 WARN_ON(r > 0);
1484                 return r;
1485         }
1486
1487         /* Update reserved bits */
1488         if ((efer ^ old_efer) & EFER_NX)
1489                 kvm_mmu_reset_context(vcpu);
1490
1491         return 0;
1492 }
1493
1494 void kvm_enable_efer_bits(u64 mask)
1495 {
1496         efer_reserved_bits &= ~mask;
1497 }
1498 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
1499
1500 bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
1501 {
1502         struct kvm *kvm = vcpu->kvm;
1503         struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges;
1504         u32 count = kvm->arch.msr_filter.count;
1505         u32 i;
1506         bool r = kvm->arch.msr_filter.default_allow;
1507         int idx;
1508
1509         /* MSR filtering not set up, or the MSR is in the x2APIC range: allow everything */
1510         if (!count || (index >= 0x800 && index <= 0x8ff))
1511                 return true;
1512
1513         /* Prevent collision with set_msr_filter */
1514         idx = srcu_read_lock(&kvm->srcu);
1515
1516         for (i = 0; i < count; i++) {
1517                 u32 start = ranges[i].base;
1518                 u32 end = start + ranges[i].nmsrs;
1519                 u32 flags = ranges[i].flags;
1520                 unsigned long *bitmap = ranges[i].bitmap;
1521
1522                 if ((index >= start) && (index < end) && (flags & type)) {
1523                         r = !!test_bit(index - start, bitmap);
1524                         break;
1525                 }
1526         }
1527
1528         srcu_read_unlock(&kvm->srcu, idx);
1529
1530         return r;
1531 }
1532 EXPORT_SYMBOL_GPL(kvm_msr_allowed);
1533
1534 /*
1535  * Write @data into the MSR specified by @index.  Select MSR specific fault
1536  * checks are bypassed if @host_initiated is %true.
1537  * Returns 0 on success, non-0 otherwise.
1538  * Assumes vcpu_load() was already called.
1539  */
1540 static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
1541                          bool host_initiated)
1542 {
1543         struct msr_data msr;
1544
1545         if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
1546                 return KVM_MSR_RET_FILTERED;
1547
1548         switch (index) {
1549         case MSR_FS_BASE:
1550         case MSR_GS_BASE:
1551         case MSR_KERNEL_GS_BASE:
1552         case MSR_CSTAR:
1553         case MSR_LSTAR:
1554                 if (is_noncanonical_address(data, vcpu))
1555                         return 1;
1556                 break;
1557         case MSR_IA32_SYSENTER_EIP:
1558         case MSR_IA32_SYSENTER_ESP:
1559                 /*
1560                  * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if
1561                  * non-canonical address is written on Intel but not on
1562                  * AMD (which ignores the top 32-bits, because it does
1563                  * not implement 64-bit SYSENTER).
1564                  *
1565                  * 64-bit code should hence be able to write a non-canonical
1566                  * value on AMD.  Making the address canonical ensures that
1567                  * vmentry does not fail on Intel after writing a non-canonical
1568                  * value, and that something deterministic happens if the guest
1569                  * invokes 64-bit SYSENTER.
1570                  */
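                     /*
                      * For illustration, assuming 48-bit virtual addresses: a
                      * guest write of 0x0000800000000000 is non-canonical;
                      * sign-extending bit 47 yields 0xffff800000000000, which
                      * is the value handed to vendor code below.
                      */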
1571                 data = get_canonical(data, vcpu_virt_addr_bits(vcpu));
1572         }
1573
1574         msr.data = data;
1575         msr.index = index;
1576         msr.host_initiated = host_initiated;
1577
1578         return kvm_x86_ops.set_msr(vcpu, &msr);
1579 }
1580
1581 static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
1582                                      u32 index, u64 data, bool host_initiated)
1583 {
1584         int ret = __kvm_set_msr(vcpu, index, data, host_initiated);
1585
1586         if (ret == KVM_MSR_RET_INVALID)
1587                 if (kvm_msr_ignored_check(vcpu, index, data, true))
1588                         ret = 0;
1589
1590         return ret;
1591 }
1592
1593 /*
1594  * Read the MSR specified by @index into @data.  Select MSR specific fault
1595  * checks are bypassed if @host_initiated is %true.
1596  * Returns 0 on success, non-0 otherwise.
1597  * Assumes vcpu_load() was already called.
1598  */
1599 int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
1600                   bool host_initiated)
1601 {
1602         struct msr_data msr;
1603         int ret;
1604
1605         if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
1606                 return KVM_MSR_RET_FILTERED;
1607
1608         msr.index = index;
1609         msr.host_initiated = host_initiated;
1610
1611         ret = kvm_x86_ops.get_msr(vcpu, &msr);
1612         if (!ret)
1613                 *data = msr.data;
1614         return ret;
1615 }
1616
1617 static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
1618                                      u32 index, u64 *data, bool host_initiated)
1619 {
1620         int ret = __kvm_get_msr(vcpu, index, data, host_initiated);
1621
1622         if (ret == KVM_MSR_RET_INVALID) {
1623                 /* Unconditionally clear *data for simplicity */
1624                 *data = 0;
1625                 if (kvm_msr_ignored_check(vcpu, index, 0, false))
1626                         ret = 0;
1627         }
1628
1629         return ret;
1630 }
1631
1632 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
1633 {
1634         return kvm_get_msr_ignored_check(vcpu, index, data, false);
1635 }
1636 EXPORT_SYMBOL_GPL(kvm_get_msr);
1637
1638 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
1639 {
1640         return kvm_set_msr_ignored_check(vcpu, index, data, false);
1641 }
1642 EXPORT_SYMBOL_GPL(kvm_set_msr);
1643
1644 static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
1645 {
1646         int err = vcpu->run->msr.error;
1647         if (!err) {
1648                 kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
1649                 kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
1650         }
1651
1652         return kvm_x86_ops.complete_emulated_msr(vcpu, err);
1653 }
1654
1655 static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu)
1656 {
1657         return kvm_x86_ops.complete_emulated_msr(vcpu, vcpu->run->msr.error);
1658 }
1659
1660 static u64 kvm_msr_reason(int r)
1661 {
1662         switch (r) {
1663         case KVM_MSR_RET_INVALID:
1664                 return KVM_MSR_EXIT_REASON_UNKNOWN;
1665         case KVM_MSR_RET_FILTERED:
1666                 return KVM_MSR_EXIT_REASON_FILTER;
1667         default:
1668                 return KVM_MSR_EXIT_REASON_INVAL;
1669         }
1670 }
1671
1672 static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index,
1673                               u32 exit_reason, u64 data,
1674                               int (*completion)(struct kvm_vcpu *vcpu),
1675                               int r)
1676 {
1677         u64 msr_reason = kvm_msr_reason(r);
1678
1679         /* Check if the user wanted to know about this MSR fault */
1680         if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason))
1681                 return 0;
1682
1683         vcpu->run->exit_reason = exit_reason;
1684         vcpu->run->msr.error = 0;
1685         memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad));
1686         vcpu->run->msr.reason = msr_reason;
1687         vcpu->run->msr.index = index;
1688         vcpu->run->msr.data = data;
1689         vcpu->arch.complete_userspace_io = completion;
1690
1691         return 1;
1692 }
1693
1694 static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r)
1695 {
1696         return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0,
1697                                    complete_emulated_rdmsr, r);
1698 }
1699
1700 static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r)
1701 {
1702         return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data,
1703                                    complete_emulated_wrmsr, r);
1704 }
1705
1706 int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
1707 {
1708         u32 ecx = kvm_rcx_read(vcpu);
1709         u64 data;
1710         int r;
1711
1712         r = kvm_get_msr(vcpu, ecx, &data);
1713
1714         /* MSR read failed? See if we should ask user space */
1715         if (r && kvm_get_msr_user_space(vcpu, ecx, r)) {
1716                 /* Bounce to user space */
1717                 return 0;
1718         }
1719
1720         if (!r) {
1721                 trace_kvm_msr_read(ecx, data);
1722
1723                 kvm_rax_write(vcpu, data & -1u);
1724                 kvm_rdx_write(vcpu, (data >> 32) & -1u);
1725         } else {
1726                 trace_kvm_msr_read_ex(ecx);
1727         }
1728
1729         return kvm_x86_ops.complete_emulated_msr(vcpu, r);
1730 }
1731 EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr);
1732
1733 int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
1734 {
1735         u32 ecx = kvm_rcx_read(vcpu);
1736         u64 data = kvm_read_edx_eax(vcpu);
1737         int r;
1738
1739         r = kvm_set_msr(vcpu, ecx, data);
1740
1741         /* MSR write failed? See if we should ask user space */
1742         if (r && kvm_set_msr_user_space(vcpu, ecx, data, r))
1743                 /* Bounce to user space */
1744                 return 0;
1745
1746         /* Signal all other negative errors to userspace */
1747         if (r < 0)
1748                 return r;
1749
1750         if (!r)
1751                 trace_kvm_msr_write(ecx, data);
1752         else
1753                 trace_kvm_msr_write_ex(ecx, data);
1754
1755         return kvm_x86_ops.complete_emulated_msr(vcpu, r);
1756 }
1757 EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
1758
1759 bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
1760 {
1761         return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) ||
1762                 xfer_to_guest_mode_work_pending();
1763 }
1764 EXPORT_SYMBOL_GPL(kvm_vcpu_exit_request);
1765
1766 /*
1767  * The fast path for frequent and performance-sensitive wrmsr emulation,
1768  * i.e. the sending of an IPI.  Sending the IPI early in the VM-Exit flow
1769  * reduces virtual-IPI latency by avoiding the expensive bits of the
1770  * guest-to-host transition, e.g. reacquiring KVM's SRCU lock, in contrast
1771  * to the other cases, which are handled after host interrupts are enabled.
1772  */
1773 static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data)
1774 {
1775         if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic))
1776                 return 1;
1777
1778         if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) &&
1779                 ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
1780                 ((data & APIC_MODE_MASK) == APIC_DM_FIXED) &&
1781                 ((u32)(data >> 32) != X2APIC_BROADCAST)) {
1782
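                     /*
                      * Bit 12 is the xAPIC Delivery Status (busy) bit; clear it
                      * before the value is mirrored into APIC_ICR/APIC_ICR2.
                      */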
1783                 data &= ~(1 << 12);
1784                 kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32));
1785                 kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32));
1786                 kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data);
1787                 trace_kvm_apic_write(APIC_ICR, (u32)data);
1788                 return 0;
1789         }
1790
1791         return 1;
1792 }
1793
1794 static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data)
1795 {
1796         if (!kvm_can_use_hv_timer(vcpu))
1797                 return 1;
1798
1799         kvm_set_lapic_tscdeadline_msr(vcpu, data);
1800         return 0;
1801 }
1802
1803 fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
1804 {
1805         u32 msr = kvm_rcx_read(vcpu);
1806         u64 data;
1807         fastpath_t ret = EXIT_FASTPATH_NONE;
1808
1809         switch (msr) {
1810         case APIC_BASE_MSR + (APIC_ICR >> 4):
1811                 data = kvm_read_edx_eax(vcpu);
1812                 if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) {
1813                         kvm_skip_emulated_instruction(vcpu);
1814                         ret = EXIT_FASTPATH_EXIT_HANDLED;
1815                 }
1816                 break;
1817         case MSR_IA32_TSCDEADLINE:
1818                 data = kvm_read_edx_eax(vcpu);
1819                 if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
1820                         kvm_skip_emulated_instruction(vcpu);
1821                         ret = EXIT_FASTPATH_REENTER_GUEST;
1822                 }
1823                 break;
1824         default:
1825                 break;
1826         }
1827
1828         if (ret != EXIT_FASTPATH_NONE)
1829                 trace_kvm_msr_write(msr, data);
1830
1831         return ret;
1832 }
1833 EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff);
1834
1835 /*
1836  * Adapt set_msr() to msr_io()'s calling convention
1837  */
1838 static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1839 {
1840         return kvm_get_msr_ignored_check(vcpu, index, data, true);
1841 }
1842
1843 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1844 {
1845         return kvm_set_msr_ignored_check(vcpu, index, *data, true);
1846 }
1847
1848 #ifdef CONFIG_X86_64
1849 struct pvclock_clock {
1850         int vclock_mode;
1851         u64 cycle_last;
1852         u64 mask;
1853         u32 mult;
1854         u32 shift;
1855         u64 base_cycles;
1856         u64 offset;
1857 };
1858
1859 struct pvclock_gtod_data {
1860         seqcount_t      seq;
1861
1862         struct pvclock_clock clock; /* extract of a clocksource struct */
1863         struct pvclock_clock raw_clock; /* extract of a clocksource struct */
1864
1865         ktime_t         offs_boot;
1866         u64             wall_time_sec;
1867 };
1868
1869 static struct pvclock_gtod_data pvclock_gtod_data;
1870
1871 static void update_pvclock_gtod(struct timekeeper *tk)
1872 {
1873         struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
1874
1875         write_seqcount_begin(&vdata->seq);
1876
1877         /* copy pvclock gtod data */
1878         vdata->clock.vclock_mode        = tk->tkr_mono.clock->vdso_clock_mode;
1879         vdata->clock.cycle_last         = tk->tkr_mono.cycle_last;
1880         vdata->clock.mask               = tk->tkr_mono.mask;
1881         vdata->clock.mult               = tk->tkr_mono.mult;
1882         vdata->clock.shift              = tk->tkr_mono.shift;
1883         vdata->clock.base_cycles        = tk->tkr_mono.xtime_nsec;
1884         vdata->clock.offset             = tk->tkr_mono.base;
1885
1886         vdata->raw_clock.vclock_mode    = tk->tkr_raw.clock->vdso_clock_mode;
1887         vdata->raw_clock.cycle_last     = tk->tkr_raw.cycle_last;
1888         vdata->raw_clock.mask           = tk->tkr_raw.mask;
1889         vdata->raw_clock.mult           = tk->tkr_raw.mult;
1890         vdata->raw_clock.shift          = tk->tkr_raw.shift;
1891         vdata->raw_clock.base_cycles    = tk->tkr_raw.xtime_nsec;
1892         vdata->raw_clock.offset         = tk->tkr_raw.base;
1893
1894         vdata->wall_time_sec            = tk->xtime_sec;
1895
1896         vdata->offs_boot                = tk->offs_boot;
1897
1898         write_seqcount_end(&vdata->seq);
1899 }
1900
1901 static s64 get_kvmclock_base_ns(void)
1902 {
1903         /* Count up from boot time, but with the frequency of the raw clock.  */
1904         return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot));
1905 }
1906 #else
1907 static s64 get_kvmclock_base_ns(void)
1908 {
1909         /* Master clock not used, so we can just use CLOCK_BOOTTIME.  */
1910         return ktime_get_boottime_ns();
1911 }
1912 #endif
1913
1914 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
1915 {
1916         int version;
1917         int r;
1918         struct pvclock_wall_clock wc;
1919         u64 wall_nsec;
1920
1921         kvm->arch.wall_clock = wall_clock;
1922
1923         if (!wall_clock)
1924                 return;
1925
1926         r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
1927         if (r)
1928                 return;
1929
1930         if (version & 1)
1931                 ++version;  /* first time write, random junk */
1932
1933         ++version;
1934
1935         if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
1936                 return;
1937
1938         /*
1939          * The guest calculates current wall clock time by adding
1940          * system time (updated by kvm_guest_time_update below) to the
1941          * wall clock specified here.  We do the reverse here.
1942          */
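             /*
              * I.e. the guest computes wall time as wc + kvmclock, so publish
              * wc = host real time - kvmclock.
              */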
1943         wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
1944
1945         wc.nsec = do_div(wall_nsec, 1000000000);
1946         wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
1947         wc.version = version;
1948
1949         kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
1950
1951         version++;
1952         kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
1953 }
1954
1955 static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
1956                                   bool old_msr, bool host_initiated)
1957 {
1958         struct kvm_arch *ka = &vcpu->kvm->arch;
1959
1960         if (vcpu->vcpu_id == 0 && !host_initiated) {
1961                 if (ka->boot_vcpu_runs_old_kvmclock != old_msr)
1962                         kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
1963
1964                 ka->boot_vcpu_runs_old_kvmclock = old_msr;
1965         }
1966
1967         vcpu->arch.time = system_time;
1968         kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
1969
1970         /* Only set up the pvclock area if the enable bit (bit 0) is set. */
1971         vcpu->arch.pv_time_enabled = false;
1972         if (!(system_time & 1))
1973                 return;
1974
1975         if (!kvm_gfn_to_hva_cache_init(vcpu->kvm,
1976                                        &vcpu->arch.pv_time, system_time & ~1ULL,
1977                                        sizeof(struct pvclock_vcpu_time_info)))
1978                 vcpu->arch.pv_time_enabled = true;
1979
1980         return;
1981 }
1982
1983 static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
1984 {
1985         do_shl32_div32(dividend, divisor);
1986         return dividend;
1987 }
1988
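     /*
      * Compute a (shift, 32-bit multiplier) pair so that a delta counted at
      * base_hz can be converted into the equivalent delta at scaled_hz using
      * pvclock-style fixed-point math, roughly ((delta << shift) * mult) >> 32,
      * where a negative shift means a right shift.
      */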
1989 static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
1990                                s8 *pshift, u32 *pmultiplier)
1991 {
1992         uint64_t scaled64;
1993         int32_t  shift = 0;
1994         uint64_t tps64;
1995         uint32_t tps32;
1996
1997         tps64 = base_hz;
1998         scaled64 = scaled_hz;
1999         while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
2000                 tps64 >>= 1;
2001                 shift--;
2002         }
2003
2004         tps32 = (uint32_t)tps64;
2005         while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
2006                 if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
2007                         scaled64 >>= 1;
2008                 else
2009                         tps32 <<= 1;
2010                 shift++;
2011         }
2012
2013         *pshift = shift;
2014         *pmultiplier = div_frac(scaled64, tps32);
2015 }
2016
2017 #ifdef CONFIG_X86_64
2018 static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
2019 #endif
2020
2021 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
2022 static unsigned long max_tsc_khz;
2023
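     /*
      * Adjust khz by ppm parts per million; e.g. 100000 kHz +/- 250 ppm gives
      * 100025 / 99975 kHz.
      */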
2024 static u32 adjust_tsc_khz(u32 khz, s32 ppm)
2025 {
2026         u64 v = (u64)khz * (1000000 + ppm);
2027         do_div(v, 1000000);
2028         return v;
2029 }
2030
2031 static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
2032 {
2033         u64 ratio;
2034
2035         /* Guest TSC same frequency as host TSC? */
2036         if (!scale) {
2037                 vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
2038                 return 0;
2039         }
2040
2041         /* TSC scaling supported? */
2042         if (!kvm_has_tsc_control) {
2043                 if (user_tsc_khz > tsc_khz) {
2044                         vcpu->arch.tsc_catchup = 1;
2045                         vcpu->arch.tsc_always_catchup = 1;
2046                         return 0;
2047                 } else {
2048                         pr_warn_ratelimited("user requested TSC rate below hardware speed\n");
2049                         return -1;
2050                 }
2051         }
2052
2053         /* TSC scaling required  - calculate ratio */
2054         ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits,
2055                                 user_tsc_khz, tsc_khz);
2056
2057         if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) {
2058                 pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
2059                                     user_tsc_khz);
2060                 return -1;
2061         }
2062
2063         vcpu->arch.tsc_scaling_ratio = ratio;
2064         return 0;
2065 }
2066
2067 static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
2068 {
2069         u32 thresh_lo, thresh_hi;
2070         int use_scaling = 0;
2071
2072         /* tsc_khz can be zero if TSC calibration fails */
2073         if (user_tsc_khz == 0) {
2074                 /* set tsc_scaling_ratio to a safe value */
2075                 vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
2076                 return -1;
2077         }
2078
2079         /* Compute a scale to convert nanoseconds in TSC cycles */
2080         kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
2081                            &vcpu->arch.virtual_tsc_shift,
2082                            &vcpu->arch.virtual_tsc_mult);
2083         vcpu->arch.virtual_tsc_khz = user_tsc_khz;
2084
2085         /*
2086          * Compute the acceptable variation in TSC rate within the
2087          * tolerance range and decide whether the requested rate is
2088          * within those bounds of the hardware rate.  If so, no
2089          * scaling or compensation needs to be done.
2090          */
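             /*
              * For example, with tsc_khz = 2000000 and a 250 ppm tolerance,
              * any user_tsc_khz in [1999500, 2000500] is treated as matching
              * the hardware rate.
              */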
2091         thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
2092         thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
2093         if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
2094                 pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
2095                 use_scaling = 1;
2096         }
2097         return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
2098 }
2099
2100 static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
2101 {
2102         u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
2103                                       vcpu->arch.virtual_tsc_mult,
2104                                       vcpu->arch.virtual_tsc_shift);
2105         tsc += vcpu->arch.this_tsc_write;
2106         return tsc;
2107 }
2108
2109 static inline int gtod_is_based_on_tsc(int mode)
2110 {
2111         return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK;
2112 }
2113
2114 static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
2115 {
2116 #ifdef CONFIG_X86_64
2117         bool vcpus_matched;
2118         struct kvm_arch *ka = &vcpu->kvm->arch;
2119         struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
2120
2121         vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
2122                          atomic_read(&vcpu->kvm->online_vcpus));
2123
2124          * Once the masterclock is enabled, always perform a request
2125          * in order to keep it updated.
2126          * order to update it.
2127          *
2128          * In order to enable masterclock, the host clocksource must be TSC
2129          * and the vcpus need to have matched TSCs.  When that happens,
2130          * perform request to enable masterclock.
2131          */
2132         if (ka->use_master_clock ||
2133             (gtod_is_based_on_tsc(gtod->clock.vclock_mode) && vcpus_matched))
2134                 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
2135
2136         trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
2137                             atomic_read(&vcpu->kvm->online_vcpus),
2138                             ka->use_master_clock, gtod->clock.vclock_mode);
2139 #endif
2140 }
2141
2142 /*
2143  * Multiply tsc by a fixed point number represented by ratio.
2144  *
2145  * The most significant 64-N bits (mult) of ratio represent the
2146  * integral part of the fixed point number; the remaining N bits
2147  * (frac) represent the fractional part, i.e. ratio represents a fixed
2148  * point number (mult + frac * 2^(-N)).
2149  *
2150  * N equals kvm_tsc_scaling_ratio_frac_bits.
2151  */
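     /*
      * For example, assuming N = 48: a guest/host frequency ratio of 1.5 is
      * encoded as ratio = 3ULL << 47, and __scale_tsc(ratio, 1000) returns 1500.
      */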
2152 static inline u64 __scale_tsc(u64 ratio, u64 tsc)
2153 {
2154         return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
2155 }
2156
2157 u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
2158 {
2159         u64 _tsc = tsc;
2160         u64 ratio = vcpu->arch.tsc_scaling_ratio;
2161
2162         if (ratio != kvm_default_tsc_scaling_ratio)
2163                 _tsc = __scale_tsc(ratio, tsc);
2164
2165         return _tsc;
2166 }
2167 EXPORT_SYMBOL_GPL(kvm_scale_tsc);
2168
2169 static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
2170 {
2171         u64 tsc;
2172
2173         tsc = kvm_scale_tsc(vcpu, rdtsc());
2174
2175         return target_tsc - tsc;
2176 }
2177
2178 u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
2179 {
2180         return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
2181 }
2182 EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
2183
2184 static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
2185 {
2186         vcpu->arch.l1_tsc_offset = offset;
2187         vcpu->arch.tsc_offset = kvm_x86_ops.write_l1_tsc_offset(vcpu, offset);
2188 }
2189
2190 static inline bool kvm_check_tsc_unstable(void)
2191 {
2192 #ifdef CONFIG_X86_64
2193         /*
2194          * TSC is marked unstable when we're running on Hyper-V, but the
2195          * 'TSC page' clocksource is still good.
2196          */
2197         if (pvclock_gtod_data.clock.vclock_mode == VDSO_CLOCKMODE_HVCLOCK)
2198                 return false;
2199 #endif
2200         return check_tsc_unstable();
2201 }
2202
2203 static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
2204 {
2205         struct kvm *kvm = vcpu->kvm;
2206         u64 offset, ns, elapsed;
2207         unsigned long flags;
2208         bool matched;
2209         bool already_matched;
2210         bool synchronizing = false;
2211
2212         raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
2213         offset = kvm_compute_tsc_offset(vcpu, data);
2214         ns = get_kvmclock_base_ns();
2215         elapsed = ns - kvm->arch.last_tsc_nsec;
2216
2217         if (vcpu->arch.virtual_tsc_khz) {
2218                 if (data == 0) {
2219                         /*
2220                          * detection of vcpu initialization -- need to sync
2221                          * with other vCPUs. This particularly helps to keep
2222                          * kvm_clock stable after CPU hotplug
2223                          */
2224                         synchronizing = true;
2225                 } else {
2226                         u64 tsc_exp = kvm->arch.last_tsc_write +
2227                                                 nsec_to_cycles(vcpu, elapsed);
2228                         u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
2229                         /*
2230                          * Special case: TSC write with a small delta (1 second)
2231                          * of virtual cycle time against real time is
2232                          * interpreted as an attempt to synchronize the CPU.
2233                          */
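                             /*
                              * I.e. treat the write as synchronizing iff
                              * |data - tsc_exp| < tsc_hz, one second's worth
                              * of guest cycles.
                              */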
2234                         synchronizing = data < tsc_exp + tsc_hz &&
2235                                         data + tsc_hz > tsc_exp;
2236                 }
2237         }
2238
2239         /*
2240          * For a reliable TSC, we can match TSC offsets, and for an unstable
2241          * TSC, we add elapsed time in this computation.  We could let the
2242          * compensation code attempt to catch up if we fall behind, but
2243          * it's better to try to match offsets from the beginning.
2244          */
2245         if (synchronizing &&
2246             vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
2247                 if (!kvm_check_tsc_unstable()) {
2248                         offset = kvm->arch.cur_tsc_offset;
2249                 } else {
2250                         u64 delta = nsec_to_cycles(vcpu, elapsed);
2251                         data += delta;
2252                         offset = kvm_compute_tsc_offset(vcpu, data);
2253                 }
2254                 matched = true;
2255                 already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
2256         } else {
2257                 /*
2258                  * We split periods of matched TSC writes into generations.
2259                  * For each generation, we track the original measured
2260                  * nanosecond time, offset, and write, so if TSCs are in
2261                  * sync, we can match exact offset, and if not, we can match
2262                  * exact software computation in compute_guest_tsc()
2263                  *
2264                  * These values are tracked in kvm->arch.cur_xxx variables.
2265                  */
2266                 kvm->arch.cur_tsc_generation++;
2267                 kvm->arch.cur_tsc_nsec = ns;
2268                 kvm->arch.cur_tsc_write = data;
2269                 kvm->arch.cur_tsc_offset = offset;
2270                 matched = false;
2271         }
2272
2273         /*
2274          * We also track the most recent recorded kHz, write and time to
2275          * allow the matching interval to be extended at each write.
2276          */
2277         kvm->arch.last_tsc_nsec = ns;
2278         kvm->arch.last_tsc_write = data;
2279         kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
2280
2281         vcpu->arch.last_guest_tsc = data;
2282
2283         /* Keep track of which generation this VCPU has synchronized to */
2284         vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
2285         vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
2286         vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
2287
2288         kvm_vcpu_write_tsc_offset(vcpu, offset);
2289         raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
2290
2291         spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
2292         if (!matched) {
2293                 kvm->arch.nr_vcpus_matched_tsc = 0;
2294         } else if (!already_matched) {
2295                 kvm->arch.nr_vcpus_matched_tsc++;
2296         }
2297
2298         kvm_track_tsc_matching(vcpu);
2299         spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
2300 }
2301
2302 static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
2303                                            s64 adjustment)
2304 {
2305         u64 tsc_offset = vcpu->arch.l1_tsc_offset;
2306         kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment);
2307 }
2308
2309 static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
2310 {
2311         if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
2312                 WARN_ON(adjustment < 0);
2313         adjustment = kvm_scale_tsc(vcpu, (u64) adjustment);
2314         adjust_tsc_offset_guest(vcpu, adjustment);
2315 }
2316
2317 #ifdef CONFIG_X86_64
2318
2319 static u64 read_tsc(void)
2320 {
2321         u64 ret = (u64)rdtsc_ordered();
2322         u64 last = pvclock_gtod_data.clock.cycle_last;
2323
2324         if (likely(ret >= last))
2325                 return ret;
2326
2327         /*
2328          * GCC likes to generate cmov here, but this branch is extremely
2329          * predictable (it's just a function of time and the likely is
2330          * very likely) and there's a data dependence, so force GCC
2331          * to generate a branch instead.  I don't barrier() because
2332          * we don't actually need a barrier, and if this function
2333          * ever gets inlined it will generate worse code.
2334          */
2335         asm volatile ("");
2336         return last;
2337 }
2338
2339 static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
2340                           int *mode)
2341 {
2342         long v;
2343         u64 tsc_pg_val;
2344
2345         switch (clock->vclock_mode) {
2346         case VDSO_CLOCKMODE_HVCLOCK:
2347                 tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
2348                                                   tsc_timestamp);
2349                 if (tsc_pg_val != U64_MAX) {
2350                         /* TSC page valid */
2351                         *mode = VDSO_CLOCKMODE_HVCLOCK;
2352                         v = (tsc_pg_val - clock->cycle_last) &
2353                                 clock->mask;
2354                 } else {
2355                         /* TSC page invalid */
2356                         *mode = VDSO_CLOCKMODE_NONE;
2357                 }
2358                 break;
2359         case VDSO_CLOCKMODE_TSC:
2360                 *mode = VDSO_CLOCKMODE_TSC;
2361                 *tsc_timestamp = read_tsc();
2362                 v = (*tsc_timestamp - clock->cycle_last) &
2363                         clock->mask;
2364                 break;
2365         default:
2366                 *mode = VDSO_CLOCKMODE_NONE;
2367         }
2368
2369         if (*mode == VDSO_CLOCKMODE_NONE)
2370                 *tsc_timestamp = v = 0;
2371
2372         return v * clock->mult;
2373 }
2374
2375 static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
2376 {
2377         struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
2378         unsigned long seq;
2379         int mode;
2380         u64 ns;
2381
2382         do {
2383                 seq = read_seqcount_begin(&gtod->seq);
2384                 ns = gtod->raw_clock.base_cycles;
2385                 ns += vgettsc(&gtod->raw_clock, tsc_timestamp, &mode);
2386                 ns >>= gtod->raw_clock.shift;
2387                 ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot));
2388         } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
2389         *t = ns;
2390
2391         return mode;
2392 }
2393
2394 static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
2395 {
2396         struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
2397         unsigned long seq;
2398         int mode;
2399         u64 ns;
2400
2401         do {
2402                 seq = read_seqcount_begin(&gtod->seq);
2403                 ts->tv_sec = gtod->wall_time_sec;
2404                 ns = gtod->clock.base_cycles;
2405                 ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
2406                 ns >>= gtod->clock.shift;
2407         } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
2408
2409         ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
2410         ts->tv_nsec = ns;
2411
2412         return mode;
2413 }
2414
2415 /* returns true if host is using TSC based clocksource */
2416 static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
2417 {
2418         /* checked again under seqlock below */
2419         if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
2420                 return false;
2421
2422         return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns,
2423                                                       tsc_timestamp));
2424 }
2425
2426 /* returns true if host is using TSC based clocksource */
2427 static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
2428                                            u64 *tsc_timestamp)
2429 {
2430         /* checked again under seqlock below */
2431         if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
2432                 return false;
2433
2434         return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp));
2435 }
2436 #endif
2437
2438 /*
2439  *
2440  * Assuming a stable TSC across physical CPUs, and a stable TSC
2441  * across virtual CPUs, the following scenario is possible.
2442  * Each numbered line represents an event visible to both
2443  * CPUs at the next numbered event.
2444  *
2445  * "timespecX" represents host monotonic time. "tscX" represents
2446  * RDTSC value.
2447  *
2448  *              VCPU0 on CPU0           |       VCPU1 on CPU1
2449  *
2450  * 1.  read timespec0,tsc0
2451  * 2.                                   | timespec1 = timespec0 + N
2452  *                                      | tsc1 = tsc0 + M
2453  * 3. transition to guest               | transition to guest
2454  * 4. ret0 = timespec0 + (rdtsc - tsc0) |
2455  * 5.                                   | ret1 = timespec1 + (rdtsc - tsc1)
2456  *                                      | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
2457  *
2458  * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
2459  *
2460  *      - ret0 < ret1
2461  *      - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
2462  *              ...
2463  *      - 0 < N - M => M < N
2464  *
2465  * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
2466  * always the case (the difference between two distinct xtime instances
2467  * might be smaller than the difference between corresponding TSC reads,
2468  * when updating guest vcpus' pvclock areas).
2469  *
2470  * To avoid that problem, do not allow visibility of distinct
2471  * system_timestamp/tsc_timestamp values simultaneously: use a master
2472  * copy of host monotonic time values. Update that master copy
2473  * in lockstep.
2474  *
2475  * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
2476  *
2477  */
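     /*
      * For example, if N = 100ns but the TSCs advanced by M = 200 cycles of a
      * 1GHz TSC (200ns) between events 1 and 2, then ret1 = ret0 + N - M ends
      * up 100ns behind ret0 even though it is computed later.
      */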
2478
2479 static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
2480 {
2481 #ifdef CONFIG_X86_64
2482         struct kvm_arch *ka = &kvm->arch;
2483         int vclock_mode;
2484         bool host_tsc_clocksource, vcpus_matched;
2485
2486         vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
2487                         atomic_read(&kvm->online_vcpus));
2488
2489         /*
2490          * If the host uses the TSC clocksource, then pass the TSC through
2491          * as stable to the guest.
2492          */
2493         host_tsc_clocksource = kvm_get_time_and_clockread(
2494                                         &ka->master_kernel_ns,
2495                                         &ka->master_cycle_now);
2496
2497         ka->use_master_clock = host_tsc_clocksource && vcpus_matched
2498                                 && !ka->backwards_tsc_observed
2499                                 && !ka->boot_vcpu_runs_old_kvmclock;
2500
2501         if (ka->use_master_clock)
2502                 atomic_set(&kvm_guest_has_master_clock, 1);
2503
2504         vclock_mode = pvclock_gtod_data.clock.vclock_mode;
2505         trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
2506                                         vcpus_matched);
2507 #endif
2508 }
2509
2510 void kvm_make_mclock_inprogress_request(struct kvm *kvm)
2511 {
2512         kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
2513 }
2514
2515 static void kvm_gen_update_masterclock(struct kvm *kvm)
2516 {
2517 #ifdef CONFIG_X86_64
2518         int i;
2519         struct kvm_vcpu *vcpu;
2520         struct kvm_arch *ka = &kvm->arch;
2521
2522         spin_lock(&ka->pvclock_gtod_sync_lock);
2523         kvm_make_mclock_inprogress_request(kvm);
2524         /* no guest entries from this point */
2525         pvclock_update_vm_gtod_copy(kvm);
2526
2527         kvm_for_each_vcpu(i, vcpu, kvm)
2528                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2529
2530         /* guest entries allowed */
2531         kvm_for_each_vcpu(i, vcpu, kvm)
2532                 kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
2533
2534         spin_unlock(&ka->pvclock_gtod_sync_lock);
2535 #endif
2536 }
2537
2538 u64 get_kvmclock_ns(struct kvm *kvm)
2539 {
2540         struct kvm_arch *ka = &kvm->arch;
2541         struct pvclock_vcpu_time_info hv_clock;
2542         u64 ret;
2543
2544         spin_lock(&ka->pvclock_gtod_sync_lock);
2545         if (!ka->use_master_clock) {
2546                 spin_unlock(&ka->pvclock_gtod_sync_lock);
2547                 return get_kvmclock_base_ns() + ka->kvmclock_offset;
2548         }
2549
2550         hv_clock.tsc_timestamp = ka->master_cycle_now;
2551         hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
2552         spin_unlock(&ka->pvclock_gtod_sync_lock);
2553
2554         /* both __this_cpu_read() and rdtsc() should be on the same cpu */
2555         get_cpu();
2556
2557         if (__this_cpu_read(cpu_tsc_khz)) {
2558                 kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
2559                                    &hv_clock.tsc_shift,
2560                                    &hv_clock.tsc_to_system_mul);
2561                 ret = __pvclock_read_cycles(&hv_clock, rdtsc());
2562         } else
2563                 ret = get_kvmclock_base_ns() + ka->kvmclock_offset;
2564
2565         put_cpu();
2566
2567         return ret;
2568 }
2569
2570 static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
2571 {
2572         struct kvm_vcpu_arch *vcpu = &v->arch;
2573         struct pvclock_vcpu_time_info guest_hv_clock;
2574
2575         if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
2576                 &guest_hv_clock, sizeof(guest_hv_clock))))
2577                 return;
2578
2579         /* This VCPU is paused, but it's legal for a guest to read another
2580          * VCPU's kvmclock, so we really have to follow the specification where
2581          * it says that version is odd if data is being modified, and even after
2582          * it is consistent.
2583          *
2584          * Version field updates must be kept separate.  This is because
2585          * kvm_write_guest_cached might use a "rep movs" instruction, and
2586          * writes within a string instruction are weakly ordered.  So there
2587          * are three writes overall.
2588          *
2589          * As a small optimization, only write the version field in the first
2590          * and third write.  The vcpu->pv_time cache is still valid, because the
2591          * version field is the first in the struct.
2592          */
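             /*
              * The guest side is expected to use the usual seqcount-style read
              * protocol, roughly:
              *   do { v = version; <read fields>; } while ((v & 1) || version != v);
              */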
2593         BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
2594
2595         if (guest_hv_clock.version & 1)
2596                 ++guest_hv_clock.version;  /* first time write, random junk */
2597
2598         vcpu->hv_clock.version = guest_hv_clock.version + 1;
2599         kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
2600                                 &vcpu->hv_clock,
2601                                 sizeof(vcpu->hv_clock.version));
2602
2603         smp_wmb();
2604
2605         /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
2606         vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
2607
2608         if (vcpu->pvclock_set_guest_stopped_request) {
2609                 vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
2610                 vcpu->pvclock_set_guest_stopped_request = false;
2611         }
2612
2613         trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
2614
2615         kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
2616                                 &vcpu->hv_clock,
2617                                 sizeof(vcpu->hv_clock));
2618
2619         smp_wmb();
2620
2621         vcpu->hv_clock.version++;
2622         kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
2623                                 &vcpu->hv_clock,
2624                                 sizeof(vcpu->hv_clock.version));
2625 }
2626
2627 static int kvm_guest_time_update(struct kvm_vcpu *v)
2628 {
2629         unsigned long flags, tgt_tsc_khz;
2630         struct kvm_vcpu_arch *vcpu = &v->arch;
2631         struct kvm_arch *ka = &v->kvm->arch;
2632         s64 kernel_ns;
2633         u64 tsc_timestamp, host_tsc;
2634         u8 pvclock_flags;
2635         bool use_master_clock;
2636
2637         kernel_ns = 0;
2638         host_tsc = 0;
2639
2640         /*
2641          * If the host uses the TSC clocksource, then pass the TSC through
2642          * as stable to the guest.
2643          */
2644         spin_lock(&ka->pvclock_gtod_sync_lock);
2645         use_master_clock = ka->use_master_clock;
2646         if (use_master_clock) {
2647                 host_tsc = ka->master_cycle_now;
2648                 kernel_ns = ka->master_kernel_ns;
2649         }
2650         spin_unlock(&ka->pvclock_gtod_sync_lock);
2651
2652         /* Keep irq disabled to prevent changes to the clock */
2653         local_irq_save(flags);
2654         tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
2655         if (unlikely(tgt_tsc_khz == 0)) {
2656                 local_irq_restore(flags);
2657                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
2658                 return 1;
2659         }
2660         if (!use_master_clock) {
2661                 host_tsc = rdtsc();
2662                 kernel_ns = get_kvmclock_base_ns();
2663         }
2664
2665         tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
2666
2667         /*
2668          * We may have to catch up the TSC to match elapsed wall clock
2669          * time for two reasons, even if kvmclock is used.
2670          *   1) CPU could have been running below the maximum TSC rate
2671          *   2) Broken TSC compensation resets the base at each VCPU
2672          *      entry to avoid unknown leaps of TSC even when running
2673          *      again on the same CPU.  This may cause apparent elapsed
2674          *      time to disappear, and the guest to stand still or run
2675          *      very slowly.
2676          */
2677         if (vcpu->tsc_catchup) {
2678                 u64 tsc = compute_guest_tsc(v, kernel_ns);
2679                 if (tsc > tsc_timestamp) {
2680                         adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
2681                         tsc_timestamp = tsc;
2682                 }
2683         }
2684
2685         local_irq_restore(flags);
2686
2687         /* With all the info we got, fill in the values */
2688
2689         if (kvm_has_tsc_control)
2690                 tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
2691
2692         if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
2693                 kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
2694                                    &vcpu->hv_clock.tsc_shift,
2695                                    &vcpu->hv_clock.tsc_to_system_mul);
2696                 vcpu->hw_tsc_khz = tgt_tsc_khz;
2697         }
2698
2699         vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
2700         vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
2701         vcpu->last_guest_tsc = tsc_timestamp;
2702
2703         /* If the host uses TSC clocksource, then it is stable */
2704         pvclock_flags = 0;
2705         if (use_master_clock)
2706                 pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
2707
2708         vcpu->hv_clock.flags = pvclock_flags;
2709
2710         if (vcpu->pv_time_enabled)
2711                 kvm_setup_pvclock_page(v);
2712         if (v == kvm_get_vcpu(v->kvm, 0))
2713                 kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
2714         return 0;
2715 }
2716
2717 /*
2718  * kvmclock updates which are isolated to a given vcpu, such as
2719  * vcpu->cpu migration, should not allow system_timestamp from
2720  * the rest of the vcpus to remain static. Otherwise ntp frequency
2721  * correction applies to one vcpu's system_timestamp but not
2722  * the others.
2723  *
2724  * So in those cases, request a kvmclock update for all vcpus.
2725  * We need to rate-limit these requests though, as they can
2726  * considerably slow guests that have a large number of vcpus.
2727  * The time for a remote vcpu to update its kvmclock is bound
2728  * by the delay we use to rate-limit the updates.
2729  */
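     /*
      * schedule_delayed_work() is a no-op while the update work is already
      * pending, so back-to-back requests are coalesced and a full-VM update
      * runs at most roughly once per KVMCLOCK_UPDATE_DELAY.
      */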
2730
2731 #define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
2732
2733 static void kvmclock_update_fn(struct work_struct *work)
2734 {
2735         int i;
2736         struct delayed_work *dwork = to_delayed_work(work);
2737         struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
2738                                            kvmclock_update_work);
2739         struct kvm *kvm = container_of(ka, struct kvm, arch);
2740         struct kvm_vcpu *vcpu;
2741
2742         kvm_for_each_vcpu(i, vcpu, kvm) {
2743                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2744                 kvm_vcpu_kick(vcpu);
2745         }
2746 }
2747
2748 static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
2749 {
2750         struct kvm *kvm = v->kvm;
2751
2752         kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
2753         schedule_delayed_work(&kvm->arch.kvmclock_update_work,
2754                                         KVMCLOCK_UPDATE_DELAY);
2755 }
2756
2757 #define KVMCLOCK_SYNC_PERIOD (300 * HZ)
2758
2759 static void kvmclock_sync_fn(struct work_struct *work)
2760 {
2761         struct delayed_work *dwork = to_delayed_work(work);
2762         struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
2763                                            kvmclock_sync_work);
2764         struct kvm *kvm = container_of(ka, struct kvm, arch);
2765
2766         if (!kvmclock_periodic_sync)
2767                 return;
2768
2769         schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
2770         schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
2771                                         KVMCLOCK_SYNC_PERIOD);
2772 }
2773
2774 /*
2775  * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP.
2776  */
2777 static bool can_set_mci_status(struct kvm_vcpu *vcpu)
2778 {
2779         /* McStatusWrEn enabled? */
2780         if (guest_cpuid_is_amd_or_hygon(vcpu))
2781                 return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
2782
2783         return false;
2784 }
2785
2786 static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2787 {
2788         u64 mcg_cap = vcpu->arch.mcg_cap;
2789         unsigned bank_num = mcg_cap & 0xff;
2790         u32 msr = msr_info->index;
2791         u64 data = msr_info->data;
2792
2793         switch (msr) {
2794         case MSR_IA32_MCG_STATUS:
2795                 vcpu->arch.mcg_status = data;
2796                 break;
2797         case MSR_IA32_MCG_CTL:
2798                 if (!(mcg_cap & MCG_CTL_P) &&
2799                     (data || !msr_info->host_initiated))
2800                         return 1;
2801                 if (data != 0 && data != ~(u64)0)
2802                         return 1;
2803                 vcpu->arch.mcg_ctl = data;
2804                 break;
2805         default:
2806                 if (msr >= MSR_IA32_MC0_CTL &&
2807                     msr < MSR_IA32_MCx_CTL(bank_num)) {
2808                         u32 offset = array_index_nospec(
2809                                 msr - MSR_IA32_MC0_CTL,
2810                                 MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
2811
2812                         /* Only 0 or all 1s can be written to IA32_MCi_CTL.
2813                          * Some Linux kernels, though, clear bit 10 in bank 4 to
2814                          * work around a BIOS/GART TBL issue on AMD K8s; ignore
2815                          * this to avoid an uncaught #GP in the guest.
2816                          */
2817                         if ((offset & 0x3) == 0 &&
2818                             data != 0 && (data | (1 << 10)) != ~(u64)0)
2819                                 return -1;
2820
2821                         /* MCi_STATUS */
2822                         if (!msr_info->host_initiated &&
2823                             (offset & 0x3) == 1 && data != 0) {
2824                                 if (!can_set_mci_status(vcpu))
2825                                         return -1;
2826                         }
2827
2828                         vcpu->arch.mce_banks[offset] = data;
2829                         break;
2830                 }
2831                 return 1;
2832         }
2833         return 0;
2834 }
2835
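     /*
      * The value written encodes the destination GPA in its page-aligned bits
      * and the index of the hypercall-blob page to copy in its page-offset bits.
      */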
2836 static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
2837 {
2838         struct kvm *kvm = vcpu->kvm;
2839         int lm = is_long_mode(vcpu);
2840         u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
2841                 : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
2842         u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
2843                 : kvm->arch.xen_hvm_config.blob_size_32;
2844         u32 page_num = data & ~PAGE_MASK;
2845         u64 page_addr = data & PAGE_MASK;
2846         u8 *page;
2847
2848         if (page_num >= blob_size)
2849                 return 1;
2850
2851         page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
2852         if (IS_ERR(page))
2853                 return PTR_ERR(page);
2854
2855         if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) {
2856                 kfree(page);
2857                 return 1;
2858         }
2859         return 0;
2860 }
2861
2862 static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
2863 {
2864         u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
2865
2866         return (vcpu->arch.apf.msr_en_val & mask) == mask;
2867 }
2868
2869 static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
2870 {
2871         gpa_t gpa = data & ~0x3f;
2872
2873         /* Bits 4:5 are reserved, should be zero */
2874         if (data & 0x30)
2875                 return 1;
2876
2877         if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) &&
2878             (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT))
2879                 return 1;
2880
2881         if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) &&
2882             (data & KVM_ASYNC_PF_DELIVERY_AS_INT))
2883                 return 1;
2884
2885         if (!lapic_in_kernel(vcpu))
2886                 return data ? 1 : 0;
2887
2888         vcpu->arch.apf.msr_en_val = data;
2889
2890         if (!kvm_pv_async_pf_enabled(vcpu)) {
2891                 kvm_clear_async_pf_completion_queue(vcpu);
2892                 kvm_async_pf_hash_reset(vcpu);
2893                 return 0;
2894         }
2895
2896         if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
2897                                         sizeof(u64)))
2898                 return 1;
2899
2900         vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
2901         vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
2902
2903         kvm_async_pf_wakeup_all(vcpu);
2904
2905         return 0;
2906 }
2907
2908 static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data)
2909 {
2910         /* Bits 8-63 are reserved */
2911         if (data >> 8)
2912                 return 1;
2913
2914         if (!lapic_in_kernel(vcpu))
2915                 return 1;
2916
2917         vcpu->arch.apf.msr_int_val = data;
2918
2919         vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK;
2920
2921         return 0;
2922 }
2923
2924 static void kvmclock_reset(struct kvm_vcpu *vcpu)
2925 {
2926         vcpu->arch.pv_time_enabled = false;
2927         vcpu->arch.time = 0;
2928 }
2929
2930 static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
2931 {
2932         ++vcpu->stat.tlb_flush;
2933         kvm_x86_ops.tlb_flush_all(vcpu);
2934 }
2935
2936 static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
2937 {
2938         ++vcpu->stat.tlb_flush;
2939         kvm_x86_ops.tlb_flush_guest(vcpu);
2940 }
2941
2942 static void record_steal_time(struct kvm_vcpu *vcpu)
2943 {
2944         struct kvm_host_map map;
2945         struct kvm_steal_time *st;
2946
2947         if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
2948                 return;
2949
2950         /* -EAGAIN is returned in atomic context so we can just return. */
2951         if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT,
2952                         &map, &vcpu->arch.st.cache, false))
2953                 return;
2954
2955         st = map.hva +
2956                 offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
2957
2958         /*
2959          * Doing a TLB flush here, on the guest's behalf, can avoid
2960          * expensive IPIs.
2961          */
2962         if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
2963                 trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
2964                                        st->preempted & KVM_VCPU_FLUSH_TLB);
2965                 if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
2966                         kvm_vcpu_flush_tlb_guest(vcpu);
2967         }
2968
2969         vcpu->arch.st.preempted = 0;
2970
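        /*
         * st->version works like a seqcount: it is bumped to an odd value
         * before the fields are updated and back to an even value after,
         * so a guest reader that observes an odd version, or a version
         * change across its read, knows to retry.
         */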
2971         if (st->version & 1)
2972                 st->version += 1;  /* first-time write, version may hold random junk */
2973
2974         st->version += 1;
2975
2976         smp_wmb();
2977
2978         st->steal += current->sched_info.run_delay -
2979                 vcpu->arch.st.last_steal;
2980         vcpu->arch.st.last_steal = current->sched_info.run_delay;
2981
2982         smp_wmb();
2983
2984         st->version += 1;
2985
2986         kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
2987 }
2988
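/*
 * Emulate a WRMSR of the MSRs handled in common code.  Returns 0 on
 * success; a non-zero return (1, or KVM_MSR_RET_INVALID for an unknown
 * MSR) tells the caller to fail the access, typically by injecting #GP
 * into the guest or reporting an error for host-initiated writes.
 */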
2989 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2990 {
2991         bool pr = false;
2992         u32 msr = msr_info->index;
2993         u64 data = msr_info->data;
2994
2995         switch (msr) {
2996         case MSR_AMD64_NB_CFG:
2997         case MSR_IA32_UCODE_WRITE:
2998         case MSR_VM_HSAVE_PA:
2999         case MSR_AMD64_PATCH_LOADER:
3000         case MSR_AMD64_BU_CFG2:
3001         case MSR_AMD64_DC_CFG:
3002         case MSR_F15H_EX_CFG:
3003                 break;
3004
3005         case MSR_IA32_UCODE_REV:
3006                 if (msr_info->host_initiated)
3007                         vcpu->arch.microcode_version = data;
3008                 break;
3009         case MSR_IA32_ARCH_CAPABILITIES:
3010                 if (!msr_info->host_initiated)
3011                         return 1;
3012                 vcpu->arch.arch_capabilities = data;
3013                 break;
3014         case MSR_IA32_PERF_CAPABILITIES: {
3015                 struct kvm_msr_entry msr_ent = {.index = msr, .data = 0};
3016
3017                 if (!msr_info->host_initiated)
3018                         return 1;
3019                 if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) && kvm_get_msr_feature(&msr_ent))
3020                         return 1;
3021                 if (data & ~msr_ent.data)
3022                         return 1;
3023
3024                 vcpu->arch.perf_capabilities = data;
3025
3026                 return 0;
3027                 }
3028         case MSR_EFER:
3029                 return set_efer(vcpu, msr_info);
3030         case MSR_K7_HWCR:
3031                 data &= ~(u64)0x40;     /* ignore flush filter disable */
3032                 data &= ~(u64)0x100;    /* ignore ignne emulation enable */
3033                 data &= ~(u64)0x8;      /* ignore TLB cache disable */
3034
3035                 /* Handle McStatusWrEn */
3036                 if (data == BIT_ULL(18)) {
3037                         vcpu->arch.msr_hwcr = data;
3038                 } else if (data != 0) {
3039                         vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
3040                                     data);
3041                         return 1;
3042                 }
3043                 break;
3044         case MSR_FAM10H_MMIO_CONF_BASE:
3045                 if (data != 0) {
3046                         vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
3047                                     "0x%llx\n", data);
3048                         return 1;
3049                 }
3050                 break;
3051         case MSR_IA32_DEBUGCTLMSR:
3052                 if (!data) {
3053                         /* We support the non-activated case already */
3054                         break;
3055                 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
3056                         /* Values other than LBR and BTF are vendor-specific,
3057                            thus reserved and should throw a #GP */
3058                         return 1;
3059                 } else if (report_ignored_msrs)
3060                         vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
3061                                     __func__, data);
3062                 break;
3063         case 0x200 ... 0x2ff:
3064                 return kvm_mtrr_set_msr(vcpu, msr, data);
3065         case MSR_IA32_APICBASE:
3066                 return kvm_set_apic_base(vcpu, msr_info);
3067         case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
3068                 return kvm_x2apic_msr_write(vcpu, msr, data);
3069         case MSR_IA32_TSCDEADLINE:
3070                 kvm_set_lapic_tscdeadline_msr(vcpu, data);
3071                 break;
3072         case MSR_IA32_TSC_ADJUST:
3073                 if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
3074                         if (!msr_info->host_initiated) {
3075                                 s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
3076                                 adjust_tsc_offset_guest(vcpu, adj);
3077                         }
3078                         vcpu->arch.ia32_tsc_adjust_msr = data;
3079                 }
3080                 break;
3081         case MSR_IA32_MISC_ENABLE:
3082                 if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
3083                     ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
3084                         if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
3085                                 return 1;
3086                         vcpu->arch.ia32_misc_enable_msr = data;
3087                         kvm_update_cpuid_runtime(vcpu);
3088                 } else {
3089                         vcpu->arch.ia32_misc_enable_msr = data;
3090                 }
3091                 break;
3092         case MSR_IA32_SMBASE:
3093                 if (!msr_info->host_initiated)
3094                         return 1;
3095                 vcpu->arch.smbase = data;
3096                 break;
3097         case MSR_IA32_POWER_CTL:
3098                 vcpu->arch.msr_ia32_power_ctl = data;
3099                 break;
3100         case MSR_IA32_TSC:
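                /*
                 * Host-initiated writes go through TSC synchronization so
                 * all vCPUs see a consistent value; a guest WRMSR instead
                 * adjusts the TSC offset (and the TSC_ADJUST shadow) so
                 * that subsequent guest reads return the written value.
                 */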
3101                 if (msr_info->host_initiated) {
3102                         kvm_synchronize_tsc(vcpu, data);
3103                 } else {
3104                         u64 adj = kvm_compute_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
3105                         adjust_tsc_offset_guest(vcpu, adj);
3106                         vcpu->arch.ia32_tsc_adjust_msr += adj;
3107                 }
3108                 break;
3109         case MSR_IA32_XSS:
3110                 if (!msr_info->host_initiated &&
3111                     !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
3112                         return 1;
3113                 /*
3114                  * KVM supports exposing PT to the guest, but does not support
3115                  * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than
3116                  * XSAVES/XRSTORS to save/restore PT MSRs.
3117                  */
3118                 if (data & ~supported_xss)
3119                         return 1;
3120                 vcpu->arch.ia32_xss = data;
3121                 break;
3122         case MSR_SMI_COUNT:
3123                 if (!msr_info->host_initiated)
3124                         return 1;
3125                 vcpu->arch.smi_count = data;
3126                 break;
3127         case MSR_KVM_WALL_CLOCK_NEW:
3128                 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3129                         return 1;
3130
3131                 kvm_write_wall_clock(vcpu->kvm, data);
3132                 break;
3133         case MSR_KVM_WALL_CLOCK:
3134                 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3135                         return 1;
3136
3137                 kvm_write_wall_clock(vcpu->kvm, data);
3138                 break;
3139         case MSR_KVM_SYSTEM_TIME_NEW:
3140                 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3141                         return 1;
3142
3143                 kvm_write_system_time(vcpu, data, false, msr_info->host_initiated);
3144                 break;
3145         case MSR_KVM_SYSTEM_TIME:
3146                 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3147                         return 1;
3148
3149                 kvm_write_system_time(vcpu, data, true, msr_info->host_initiated);
3150                 break;
3151         case MSR_KVM_ASYNC_PF_EN:
3152                 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
3153                         return 1;
3154
3155                 if (kvm_pv_enable_async_pf(vcpu, data))
3156                         return 1;
3157                 break;
3158         case MSR_KVM_ASYNC_PF_INT:
3159                 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3160                         return 1;
3161
3162                 if (kvm_pv_enable_async_pf_int(vcpu, data))
3163                         return 1;
3164                 break;
3165         case MSR_KVM_ASYNC_PF_ACK:
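                /*
                 * Writing bit 0 acknowledges the most recent 'page ready'
                 * notification and lets the next pending completion, if
                 * any, be delivered.
                 */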
3166                 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
3167                         return 1;
3168                 if (data & 0x1) {
3169                         vcpu->arch.apf.pageready_pending = false;
3170                         kvm_check_async_pf_completion(vcpu);
3171                 }
3172                 break;
3173         case MSR_KVM_STEAL_TIME:
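                /*
                 * Bit 0 (KVM_MSR_ENABLED) turns steal-time reporting on;
                 * the remaining non-reserved bits hold the address of the
                 * guest's kvm_steal_time area.  The area itself is updated
                 * lazily when KVM_REQ_STEAL_UPDATE is processed.
                 */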
3174                 if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
3175                         return 1;
3176
3177                 if (unlikely(!sched_info_on()))
3178                         return 1;
3179
3180                 if (data & KVM_STEAL_RESERVED_MASK)
3181                         return 1;
3182
3183                 vcpu->arch.st.msr_val = data;
3184
3185                 if (!(data & KVM_MSR_ENABLED))
3186                         break;
3187
3188                 kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
3189
3190                 break;
3191         case MSR_KVM_PV_EOI_EN:
3192                 if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
3193                         return 1;
3194
3195                 if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8)))
3196                         return 1;
3197                 break;
3198
3199         case MSR_KVM_POLL_CONTROL:
3200                 if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
3201                         return 1;
3202
3203                 /* only enable bit supported */
3204                 if (data & (-1ULL << 1))
3205                         return 1;
3206
3207                 vcpu->arch.msr_kvm_poll_control = data;
3208                 break;
3209
3210         case MSR_IA32_MCG_CTL:
3211         case MSR_IA32_MCG_STATUS:
3212         case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
3213                 return set_msr_mce(vcpu, msr_info);
3214
3215         case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
3216         case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
3217                 pr = true;
3218                 fallthrough;
3219         case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
3220         case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
3221                 if (kvm_pmu_is_valid_msr(vcpu, msr))
3222                         return kvm_pmu_set_msr(vcpu, msr_info);
3223
3224                 if (pr || data != 0)
3225                         vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
3226                                     "0x%x data 0x%llx\n", msr, data);
3227                 break;
3228         case MSR_K7_CLK_CTL:
3229                 /*
3230                  * Ignore all writes to this no longer documented MSR.
3231                  * Writes are only relevant for old K7 processors,
3232                  * all pre-dating SVM, but a recommended workaround from
3233                  * AMD for these chips. It is possible to specify the
3234                  * affected processor models on the command line, hence
3235                  * the need to ignore the workaround.
3236                  */
3237                 break;
3238         case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
3239         case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
3240         case HV_X64_MSR_SYNDBG_OPTIONS:
3241         case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
3242         case HV_X64_MSR_CRASH_CTL:
3243         case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
3244         case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
3245         case HV_X64_MSR_TSC_EMULATION_CONTROL:
3246         case HV_X64_MSR_TSC_EMULATION_STATUS:
3247                 return kvm_hv_set_msr_common(vcpu, msr, data,
3248                                              msr_info->host_initiated);
3249         case MSR_IA32_BBL_CR_CTL3:
3250                 /* Drop writes to this legacy MSR -- see rdmsr
3251                  * counterpart for further detail.
3252                  */
3253                 if (report_ignored_msrs)
3254                         vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n",
3255                                 msr, data);
3256                 break;
3257         case MSR_AMD64_OSVW_ID_LENGTH:
3258                 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
3259                         return 1;
3260                 vcpu->arch.osvw.length = data;
3261                 break;
3262         case MSR_AMD64_OSVW_STATUS:
3263                 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
3264                         return 1;
3265                 vcpu->arch.osvw.status = data;
3266                 break;
3267         case MSR_PLATFORM_INFO:
3268                 if (!msr_info->host_initiated ||
3269                     (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) &&
3270                      cpuid_fault_enabled(vcpu)))
3271                         return 1;
3272                 vcpu->arch.msr_platform_info = data;
3273                 break;
3274         case MSR_MISC_FEATURES_ENABLES:
3275                 if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT ||
3276                     (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
3277                      !supports_cpuid_fault(vcpu)))
3278                         return 1;
3279                 vcpu->arch.msr_misc_features_enables = data;
3280                 break;
3281         default:
3282                 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
3283                         return xen_hvm_config(vcpu, data);
3284                 if (kvm_pmu_is_valid_msr(vcpu, msr))
3285                         return kvm_pmu_set_msr(vcpu, msr_info);
3286                 return KVM_MSR_RET_INVALID;
3287         }
3288         return 0;
3289 }
3290 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
3291
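/*
 * Read one of the machine-check MSRs on behalf of the guest, or of the
 * host for host-initiated accesses, which may read MCG_CTL even when the
 * guest's MCG_CAP does not advertise it.
 */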
3292 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
3293 {
3294         u64 data;
3295         u64 mcg_cap = vcpu->arch.mcg_cap;
3296         unsigned bank_num = mcg_cap & 0xff;
3297
3298         switch (msr) {
3299         case MSR_IA32_P5_MC_ADDR:
3300         case MSR_IA32_P5_MC_TYPE:
3301                 data = 0;
3302                 break;
3303         case MSR_IA32_MCG_CAP:
3304                 data = vcpu->arch.mcg_cap;
3305                 break;
3306         case MSR_IA32_MCG_CTL:
3307                 if (!(mcg_cap & MCG_CTL_P) && !host)
3308                         return 1;
3309                 data = vcpu->arch.mcg_ctl;
3310                 break;
3311         case MSR_IA32_MCG_STATUS:
3312                 data = vcpu->arch.mcg_status;
3313                 break;
3314         default:
3315                 if (msr >= MSR_IA32_MC0_CTL &&
3316                     msr < MSR_IA32_MCx_CTL(bank_num)) {
3317                         u32 offset = array_index_nospec(
3318                                 msr - MSR_IA32_MC0_CTL,
3319                                 MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
3320
3321                         data = vcpu->arch.mce_banks[offset];
3322                         break;
3323                 }
3324                 return 1;
3325         }
3326         *pdata = data;
3327         return 0;
3328 }
3329
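/*
 * Emulate a RDMSR of the MSRs handled in common code, filling in
 * msr_info->data on success; a non-zero return tells the caller to fail
 * the access.
 */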
3330 int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3331 {
3332         switch (msr_info->index) {
3333         case MSR_IA32_PLATFORM_ID:
3334         case MSR_IA32_EBL_CR_POWERON:
3335         case MSR_IA32_DEBUGCTLMSR:
3336         case MSR_IA32_LASTBRANCHFROMIP:
3337         case MSR_IA32_LASTBRANCHTOIP:
3338         case MSR_IA32_LASTINTFROMIP:
3339         case MSR_IA32_LASTINTTOIP:
3340         case MSR_K8_SYSCFG:
3341         case MSR_K8_TSEG_ADDR:
3342         case MSR_K8_TSEG_MASK:
3343         case MSR_VM_HSAVE_PA:
3344         case MSR_K8_INT_PENDING_MSG:
3345         case MSR_AMD64_NB_CFG:
3346         case MSR_FAM10H_MMIO_CONF_BASE:
3347         case MSR_AMD64_BU_CFG2:
3348         case MSR_IA32_PERF_CTL:
3349         case MSR_AMD64_DC_CFG:
3350         case MSR_F15H_EX_CFG:
3351         /*
3352          * Intel Sandy Bridge CPUs must support the RAPL (running average power
3353          * limit) MSRs. Just return 0, as we do not want to expose the host
3354          * data here. Do not conditionalize this on CPUID, as KVM does not do
3355          * so for existing CPU-specific MSRs.
3356          */
3357         case MSR_RAPL_POWER_UNIT:
3358         case MSR_PP0_ENERGY_STATUS:     /* Power plane 0 (core) */
3359         case MSR_PP1_ENERGY_STATUS:     /* Power plane 1 (graphics uncore) */
3360         case MSR_PKG_ENERGY_STATUS:     /* Total package */
3361         case MSR_DRAM_ENERGY_STATUS:    /* DRAM controller */
3362                 msr_info->data = 0;
3363                 break;
3364         case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
3365         case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
3366         case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
3367         case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
3368         case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
3369                 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
3370                         return kvm_pmu_get_msr(vcpu, msr_info);
3371                 msr_info->data = 0;
3372                 break;
3373         case MSR_IA32_UCODE_REV:
3374                 msr_info->data = vcpu->arch.microcode_version;
3375                 break;
3376         case MSR_IA32_ARCH_CAPABILITIES:
3377                 if (!msr_info->host_initiated &&
3378                     !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
3379                         return 1;
3380                 msr_info->data = vcpu->arch.arch_capabilities;
3381                 break;
3382         case MSR_IA32_PERF_CAPABILITIES:
3383                 if (!msr_info->host_initiated &&
3384                     !guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
3385                         return 1;
3386                 msr_info->data = vcpu->arch.perf_capabilities;
3387                 break;
3388         case MSR_IA32_POWER_CTL:
3389                 msr_info->data = vcpu->arch.msr_ia32_power_ctl;
3390                 break;
3391         case MSR_IA32_TSC: {
3392                 /*
3393                  * Intel SDM states that MSR_IA32_TSC read adds the TSC offset
3394                  * even when not intercepted. The AMD manual doesn't state
3395                  * this explicitly, but AMD CPUs appear to behave the same.
3396                  *
3397                  * On userspace reads and writes, however, we unconditionally
3398                  * return L1's TSC value to ensure backwards-compatible
3399                  * behavior for migration.
3400                  */
3401                 u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset :
3402                                                             vcpu->arch.tsc_offset;
3403
3404                 msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + tsc_offset;
3405                 break;
3406         }
3407         case MSR_MTRRcap:
3408         case 0x200 ... 0x2ff:
3409                 return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
3410         case 0xcd: /* fsb frequency */
3411                 msr_info->data = 3;
3412                 break;
3413                 /*
3414                  * MSR_EBC_FREQUENCY_ID
3415                  * Conservative value valid for even the basic CPU models.
3416                  * Models 0 and 1: 000 in bits 23:21 indicates a bus
3417                  * speed of 100MHz; model 2: 000 in bits 18:16 indicates
3418                  * 100MHz; models 3 and 4: 266MHz.  Set the Core Clock
3419                  * Frequency to System Bus Frequency Ratio field (bits
3420                  * 31:24) to 1 even though it is only valid for CPU
3421                  * models > 2, since guests may otherwise end up dividing
3422                  * or multiplying by zero.
3423                  */
3424         case MSR_EBC_FREQUENCY_ID:
3425                 msr_info->data = 1 << 24;
3426                 break;
3427         case MSR_IA32_APICBASE:
3428                 msr_info->data = kvm_get_apic_base(vcpu);
3429                 break;
3430         case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
3431                 return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
3432         case MSR_IA32_TSCDEADLINE:
3433                 msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
3434                 break;
3435         case MSR_IA32_TSC_ADJUST:
3436                 msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
3437                 break;
3438         case MSR_IA32_MISC_ENABLE:
3439                 msr_info->data = vcpu->arch.ia32_misc_enable_msr;
3440                 break;
3441         case MSR_IA32_SMBASE:
3442                 if (!msr_info->host_initiated)
3443                         return 1;
3444                 msr_info->data = vcpu->arch.smbase;
3445                 break;
3446         case MSR_SMI_COUNT:
3447                 msr_info->data = vcpu->arch.smi_count;
3448                 break;
3449         case MSR_IA32_PERF_STATUS:
3450                 /* TSC increment by tick */
3451                 msr_info->data = 1000ULL;
3452                 /* CPU multiplier */
3453                 msr_info->data |= (((uint64_t)4ULL) << 40);
3454                 break;
3455         case MSR_EFER:
3456                 msr_info->data = vcpu->arch.efer;
3457                 break;
3458         case MSR_KVM_WALL_CLOCK:
3459                 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3460                         return 1;
3461
3462                 msr_info->data = vcpu->kvm->arch.wall_clock;
3463                 break;
3464         case MSR_KVM_WALL_CLOCK_NEW:
3465                 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3466                         return 1;
3467
3468                 msr_info->data = vcpu->kvm->arch.wall_clock;
3469                 break;
3470         case MSR_KVM_SYSTEM_TIME:
3471                 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3472                         return 1;
3473
3474                 msr_info->data = vcpu->arch.time;
3475                 break;
3476         case MSR_KVM_SYSTEM_TIME_NEW:
3477                 if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3478                         return 1;
3479
3480                 msr_info->data = vcpu->arch.time;
3481                 break;
3482         case MSR_KVM_ASYNC_PF_EN:
3483                 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
3484                         return 1;
3485
3486                 msr_info->data = vcpu->arch.apf.msr_en_val;
3487                 break;
3488         case MSR_KVM_ASYNC_PF_INT:
3489                 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3490                         return 1;
3491
3492                 msr_info->data = vcpu->arch.apf.msr_int_val;
3493                 break;
3494         case MSR_KVM_ASYNC_PF_ACK:
3495                 if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
3496                         return 1;
3497
3498                 msr_info->data = 0;
3499                 break;
3500         case MSR_KVM_STEAL_TIME:
3501                 if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
3502                         return 1;
3503
3504                 msr_info->data = vcpu->arch.st.msr_val;
3505                 break;
3506         case MSR_KVM_PV_EOI_EN:
3507                 if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
3508                         return 1;
3509
3510                 msr_info->data = vcpu->arch.pv_eoi.msr_val;
3511                 break;
3512         case MSR_KVM_POLL_CONTROL:
3513                 if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
3514                         return 1;
3515
3516                 msr_info->data = vcpu->arch.msr_kvm_poll_control;
3517                 break;
3518         case MSR_IA32_P5_MC_ADDR:
3519         case MSR_IA32_P5_MC_TYPE:
3520         case MSR_IA32_MCG_CAP:
3521         case MSR_IA32_MCG_CTL:
3522         case MSR_IA32_MCG_STATUS:
3523         case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
3524                 return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
3525                                    msr_info->host_initiated);
3526         case MSR_IA32_XSS:
3527                 if (!msr_info->host_initiated &&
3528                     !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
3529                         return 1;
3530                 msr_info->data = vcpu->arch.ia32_xss;
3531                 break;
3532         case MSR_K7_CLK_CTL:
3533                 /*
3534                  * Provide the expected ramp-up count for K7. All other
3535                  * fields are set to zero, indicating minimum divisors
3536                  * for every field.
3537                  *
3538                  * This prevents guest kernels on an AMD host with CPU
3539                  * type 6, model 8 and higher from crashing due to the
3540                  * rdmsr failing.
3541                  */
3542                 msr_info->data = 0x20000000;
3543                 break;
3544         case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
3545         case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
3546         case HV_X64_MSR_SYNDBG_OPTIONS:
3547         case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
3548         case HV_X64_MSR_CRASH_CTL:
3549         case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
3550         case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
3551         case HV_X64_MSR_TSC_EMULATION_CONTROL:
3552         case HV_X64_MSR_TSC_EMULATION_STATUS:
3553                 return kvm_hv_get_msr_common(vcpu,
3554                                              msr_info->index, &msr_info->data,
3555                                              msr_info->host_initiated);
3556         case MSR_IA32_BBL_CR_CTL3:
3557                 /* This legacy MSR exists but isn't fully documented in current
3558                  * silicon.  It is however accessed by winxp in very narrow
3559                  * scenarios where it sets bit #19, itself documented as
3560                  * a "reserved" bit.  Best effort attempt to source coherent
3561                  * read data here should the balance of the register be
3562                  * interpreted by the guest:
3563                  *
3564                  * L2 cache control register 3: 64GB range, 256KB size,
3565                  * enabled, latency 0x1, configured
3566                  */
3567                 msr_info->data = 0xbe702111;
3568                 break;
3569         case MSR_AMD64_OSVW_ID_LENGTH:
3570                 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
3571                         return 1;
3572                 msr_info->data = vcpu->arch.osvw.length;
3573                 break;
3574         case MSR_AMD64_OSVW_STATUS:
3575                 if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
3576                         return 1;
3577                 msr_info->data = vcpu->arch.osvw.status;
3578                 break;
3579         case MSR_PLATFORM_INFO:
3580                 if (!msr_info->host_initiated &&
3581                     !vcpu->kvm->arch.guest_can_read_msr_platform_info)
3582                         return 1;
3583                 msr_info->data = vcpu->arch.msr_platform_info;
3584                 break;
3585         case MSR_MISC_FEATURES_ENABLES:
3586                 msr_info->data = vcpu->arch.msr_misc_features_enables;
3587                 break;
3588         case MSR_K7_HWCR:
3589                 msr_info->data = vcpu->arch.msr_hwcr;
3590                 break;
3591         default:
3592                 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
3593                         return kvm_pmu_get_msr(vcpu, msr_info);
3594                 return KVM_MSR_RET_INVALID;
3595         }
3596         return 0;
3597 }
3598 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
3599
3600 /*
3601  * Read or write a bunch of MSRs. All parameters are kernel addresses.
3602  *
3603  * @return number of MSRs processed successfully.
3604  */
3605 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
3606                     struct kvm_msr_entry *entries,
3607                     int (*do_msr)(struct kvm_vcpu *vcpu,
3608                                   unsigned index, u64 *data))
3609 {
3610         int i;
3611
3612         for (i = 0; i < msrs->nmsrs; ++i)
3613                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
3614                         break;
3615
3616         return i;
3617 }
3618
3619 /*
3620  * Read or write a bunch of MSRs. Parameters are user addresses.
3621  *
3622  * @return number of MSRs processed successfully.
3623  */
3624 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
3625                   int (*do_msr)(struct kvm_vcpu *vcpu,
3626                                 unsigned index, u64 *data),
3627                   int writeback)
3628 {
3629         struct kvm_msrs msrs;
3630         struct kvm_msr_entry *entries;
3631         int r, n;
3632         unsigned size;
3633
3634         r = -EFAULT;
3635         if (copy_from_user(&msrs, user_msrs, sizeof(msrs)))
3636                 goto out;
3637
3638         r = -E2BIG;
3639         if (msrs.nmsrs >= MAX_IO_MSRS)
3640                 goto out;
3641
3642         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
3643         entries = memdup_user(user_msrs->entries, size);
3644         if (IS_ERR(entries)) {
3645                 r = PTR_ERR(entries);
3646                 goto out;
3647         }
3648
3649         r = n = __msr_io(vcpu, &msrs, entries, do_msr);
3650         if (r < 0)
3651                 goto out_free;
3652
3653         r = -EFAULT;
3654         if (writeback && copy_to_user(user_msrs->entries, entries, size))
3655                 goto out_free;
3656
3657         r = n;
3658
3659 out_free:
3660         kfree(entries);
3661 out:
3662         return r;
3663 }
3664
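/*
 * MWAIT is only exposed to guests when the host supports it, is not
 * affected by the MONITOR/MWAIT erratum, and has an always-running APIC
 * timer (ARAT); without ARAT, deep C-states reached via MWAIT could stop
 * the APIC timer.
 */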
3665 static inline bool kvm_can_mwait_in_guest(void)
3666 {
3667         return boot_cpu_has(X86_FEATURE_MWAIT) &&
3668                 !boot_cpu_has_bug(X86_BUG_MONITOR) &&
3669                 boot_cpu_has(X86_FEATURE_ARAT);
3670 }
3671
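/*
 * Handler for the KVM_GET_SUPPORTED_HV_CPUID ioctl: report the Hyper-V
 * CPUID leaves KVM can expose for this vCPU back to userspace.
 */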
3672 static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu,
3673                                             struct kvm_cpuid2 __user *cpuid_arg)
3674 {
3675         struct kvm_cpuid2 cpuid;
3676         int r;
3677
3678         r = -EFAULT;
3679         if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
3680                 return r;
3681
3682         r = kvm_get_hv_cpuid(vcpu, &cpuid, cpuid_arg->entries);
3683         if (r)
3684                 return r;
3685
3686         r = -EFAULT;