KVM: SVM: Detect X2APIC virtualization (x2AVIC) support
[linux-2.6-microblaze.git] / arch/x86/kvm/svm/svm.c
1 #define pr_fmt(fmt) "SVM: " fmt
2
3 #include <linux/kvm_host.h>
4
5 #include "irq.h"
6 #include "mmu.h"
7 #include "kvm_cache_regs.h"
8 #include "x86.h"
9 #include "cpuid.h"
10 #include "pmu.h"
11
12 #include <linux/module.h>
13 #include <linux/mod_devicetable.h>
14 #include <linux/kernel.h>
15 #include <linux/vmalloc.h>
16 #include <linux/highmem.h>
17 #include <linux/amd-iommu.h>
18 #include <linux/sched.h>
19 #include <linux/trace_events.h>
20 #include <linux/slab.h>
21 #include <linux/hashtable.h>
22 #include <linux/objtool.h>
23 #include <linux/psp-sev.h>
24 #include <linux/file.h>
25 #include <linux/pagemap.h>
26 #include <linux/swap.h>
27 #include <linux/rwsem.h>
28 #include <linux/cc_platform.h>
29
30 #include <asm/apic.h>
31 #include <asm/perf_event.h>
32 #include <asm/tlbflush.h>
33 #include <asm/desc.h>
34 #include <asm/debugreg.h>
35 #include <asm/kvm_para.h>
36 #include <asm/irq_remapping.h>
37 #include <asm/spec-ctrl.h>
38 #include <asm/cpu_device_id.h>
39 #include <asm/traps.h>
40 #include <asm/fpu/api.h>
41
42 #include <asm/virtext.h>
43 #include "trace.h"
44
45 #include "svm.h"
46 #include "svm_ops.h"
47
48 #include "kvm_onhyperv.h"
49 #include "svm_onhyperv.h"
50
51 MODULE_AUTHOR("Qumranet");
52 MODULE_LICENSE("GPL");
53
54 #ifdef MODULE
55 static const struct x86_cpu_id svm_cpu_id[] = {
56         X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
57         {}
58 };
59 MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
60 #endif
61
62 #define SEG_TYPE_LDT 2
63 #define SEG_TYPE_BUSY_TSS16 3
64
65 static bool erratum_383_found __read_mostly;
66
67 u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
68
69 /*
70  * Set osvw_len to a higher value when updated Revision Guides
71  * are published and we know what the new status bits are.
72  */
73 static uint64_t osvw_len = 4, osvw_status;
74
75 static DEFINE_PER_CPU(u64, current_tsc_ratio);
76
77 static const struct svm_direct_access_msrs {
78         u32 index;   /* Index of the MSR */
79         bool always; /* True if intercept is initially cleared */
80 } direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
81         { .index = MSR_STAR,                            .always = true  },
82         { .index = MSR_IA32_SYSENTER_CS,                .always = true  },
83         { .index = MSR_IA32_SYSENTER_EIP,               .always = false },
84         { .index = MSR_IA32_SYSENTER_ESP,               .always = false },
85 #ifdef CONFIG_X86_64
86         { .index = MSR_GS_BASE,                         .always = true  },
87         { .index = MSR_FS_BASE,                         .always = true  },
88         { .index = MSR_KERNEL_GS_BASE,                  .always = true  },
89         { .index = MSR_LSTAR,                           .always = true  },
90         { .index = MSR_CSTAR,                           .always = true  },
91         { .index = MSR_SYSCALL_MASK,                    .always = true  },
92 #endif
93         { .index = MSR_IA32_SPEC_CTRL,                  .always = false },
94         { .index = MSR_IA32_PRED_CMD,                   .always = false },
95         { .index = MSR_IA32_LASTBRANCHFROMIP,           .always = false },
96         { .index = MSR_IA32_LASTBRANCHTOIP,             .always = false },
97         { .index = MSR_IA32_LASTINTFROMIP,              .always = false },
98         { .index = MSR_IA32_LASTINTTOIP,                .always = false },
99         { .index = MSR_EFER,                            .always = false },
100         { .index = MSR_IA32_CR_PAT,                     .always = false },
101         { .index = MSR_AMD64_SEV_ES_GHCB,               .always = true  },
102         { .index = MSR_TSC_AUX,                         .always = false },
103         { .index = MSR_INVALID,                         .always = false },
104 };
105
106 /*
107  * These 2 parameters are used to configure the controls for Pause-Loop Exiting:
108  * pause_filter_count: On processors that support Pause filtering (indicated
109  *      by CPUID Fn8000_000A_EDX), the VMCB provides a 16-bit pause filter
110  *      count value. On VMRUN this value is loaded into an internal counter.
111  *      Each time a pause instruction is executed, this counter is decremented
112  *      until it reaches zero at which time a #VMEXIT is generated if pause
113  *      intercept is enabled. Refer to  AMD APM Vol 2 Section 15.14.4 Pause
114  *      Intercept Filtering for more details.
115  *      This also indicates whether PLE logic is enabled.
116  *
117  * pause_filter_thresh: In addition, some processor families support advanced
118  *      pause filtering (indicated by CPUID Fn8000_000A_EDX), which places
119  *      an upper bound on how long a guest may execute in a pause loop.
120  *      In this mode, a 16-bit pause filter threshold field is added in the
121  *      VMCB. The threshold value is a cycle count that is used to reset the
122  *      pause counter. As with simple pause filtering, VMRUN loads the pause
123  *      count value from VMCB into an internal counter. Then, on each pause
124  *      instruction the hardware checks the elapsed number of cycles since
125  *      the most recent pause instruction against the pause filter threshold.
126  *      If the elapsed cycle count is greater than the pause filter threshold,
127  *      then the internal pause count is reloaded from the VMCB and execution
128  *      continues. If the elapsed cycle count is less than the pause filter
129  *      threshold, then the internal pause count is decremented. If the count
130  *      value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
131  *      triggered. If advanced pause filtering is supported and pause filter
132  *      threshold field is set to zero, the filter will operate in the simpler,
133  *      count only mode.
134  */
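
/*
 * Illustrative example of the interplay described above (the numbers are
 * invented for the example and are not the module defaults): with
 * pause_filter_count = 3000 and pause_filter_thresh = 128, a vCPU spinning
 * on a contended lock issues PAUSE every few cycles, well under the
 * 128-cycle threshold, so the internal counter is decremented on each
 * iteration and a #VMEXIT fires after roughly 3000 PAUSEs.  A vCPU that
 * pauses only occasionally (more than 128 cycles apart) keeps reloading
 * the counter from the VMCB and never triggers the exit.
 */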
135
136 static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
137 module_param(pause_filter_thresh, ushort, 0444);
138
139 static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
140 module_param(pause_filter_count, ushort, 0444);
141
142 /* Default doubles per-vcpu window every exit. */
143 static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
144 module_param(pause_filter_count_grow, ushort, 0444);
145
146 /* Default resets per-vcpu window every exit to pause_filter_count. */
147 static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
148 module_param(pause_filter_count_shrink, ushort, 0444);
149
150 /* Default is to compute the maximum so we can never overflow. */
151 static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
152 module_param(pause_filter_count_max, ushort, 0444);
153
154 /*
155  * Use nested page tables by default.  Note, NPT may get forced off by
156  * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
157  */
158 bool npt_enabled = true;
159 module_param_named(npt, npt_enabled, bool, 0444);
160
161 /* allow nested virtualization in KVM/SVM */
162 static int nested = true;
163 module_param(nested, int, S_IRUGO);
164
165 /* enable/disable Next RIP Save */
166 static int nrips = true;
167 module_param(nrips, int, 0444);
168
169 /* enable/disable Virtual VMLOAD VMSAVE */
170 static int vls = true;
171 module_param(vls, int, 0444);
172
173 /* enable/disable Virtual GIF */
174 int vgif = true;
175 module_param(vgif, int, 0444);
176
177 /* enable/disable LBR virtualization */
178 static int lbrv = true;
179 module_param(lbrv, int, 0444);
180
181 static int tsc_scaling = true;
182 module_param(tsc_scaling, int, 0444);
183
184 /*
185  * enable / disable AVIC.  Because the defaults differ for APICv
186  * support between VMX and SVM we cannot use module_param_named.
187  */
188 static bool avic;
189 module_param(avic, bool, 0444);
190
191 bool __read_mostly dump_invalid_vmcb;
192 module_param(dump_invalid_vmcb, bool, 0644);
193
194
195 bool intercept_smi = true;
196 module_param(intercept_smi, bool, 0444);
197
198
199 static bool svm_gp_erratum_intercept = true;
200
201 static u8 rsm_ins_bytes[] = "\x0f\xaa";
202
203 static unsigned long iopm_base;
204
205 struct kvm_ldttss_desc {
206         u16 limit0;
207         u16 base0;
208         unsigned base1:8, type:5, dpl:2, p:1;
209         unsigned limit1:4, zero0:3, g:1, base2:8;
210         u32 base3;
211         u32 zero1;
212 } __attribute__((packed));
213
214 DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
215
216 /*
217  * Only MSR_TSC_AUX is switched via the user return hook.  EFER is switched via
218  * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
219  *
220  * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
221  * defer the restoration of TSC_AUX until the CPU returns to userspace.
222  */
223 static int tsc_aux_uret_slot __read_mostly = -1;
224
225 static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
226
227 #define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
228 #define MSRS_RANGE_SIZE 2048
229 #define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
230
231 u32 svm_msrpm_offset(u32 msr)
232 {
233         u32 offset;
234         int i;
235
236         for (i = 0; i < NUM_MSR_MAPS; i++) {
237                 if (msr < msrpm_ranges[i] ||
238                     msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
239                         continue;
240
241                 offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
242                 offset += (i * MSRS_RANGE_SIZE);       /* add range offset */
243
244                 /* Now we have the u8 offset - but need the u32 offset */
245                 return offset / 4;
246         }
247
248         /* MSR not in any range */
249         return MSR_INVALID;
250 }
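
/*
 * Worked example of the math above, using MSR_LSTAR (0xc0000082) purely
 * for illustration: the matching range base is 0xc0000000, so the byte
 * offset into the bitmap is (0x82 / 4) + 1 * MSRS_RANGE_SIZE
 * = 0x20 + 0x800 = 0x820, and the function returns 0x820 / 4 = 0x208,
 * the index of the u32 word holding this MSR's two permission bits.
 */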
251
252 static void svm_flush_tlb_current(struct kvm_vcpu *vcpu);
253
254 static int get_npt_level(void)
255 {
256 #ifdef CONFIG_X86_64
257         return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
258 #else
259         return PT32E_ROOT_LEVEL;
260 #endif
261 }
262
263 int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
264 {
265         struct vcpu_svm *svm = to_svm(vcpu);
266         u64 old_efer = vcpu->arch.efer;
267         vcpu->arch.efer = efer;
268
269         if (!npt_enabled) {
270                 /* Shadow paging assumes NX to be available.  */
271                 efer |= EFER_NX;
272
273                 if (!(efer & EFER_LMA))
274                         efer &= ~EFER_LME;
275         }
276
277         if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
278                 if (!(efer & EFER_SVME)) {
279                         svm_leave_nested(vcpu);
280                         svm_set_gif(svm, true);
281                         /* #GP intercept is still needed for vmware backdoor */
282                         if (!enable_vmware_backdoor)
283                                 clr_exception_intercept(svm, GP_VECTOR);
284
285                         /*
286                          * Free the nested guest state, unless we are in SMM.
287                          * In this case we will return to the nested guest
288                          * as soon as we leave SMM.
289                          */
290                         if (!is_smm(vcpu))
291                                 svm_free_nested(svm);
292
293                 } else {
294                         int ret = svm_allocate_nested(svm);
295
296                         if (ret) {
297                                 vcpu->arch.efer = old_efer;
298                                 return ret;
299                         }
300
301                         /*
302                          * Never intercept #GP for SEV guests, KVM can't
303                          * decrypt guest memory to work around the erratum.
304                          */
305                         if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
306                                 set_exception_intercept(svm, GP_VECTOR);
307                 }
308         }
309
310         svm->vmcb->save.efer = efer | EFER_SVME;
311         vmcb_mark_dirty(svm->vmcb, VMCB_CR);
312         return 0;
313 }
314
315 static int is_external_interrupt(u32 info)
316 {
317         info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
318         return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
319 }
320
321 static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
322 {
323         struct vcpu_svm *svm = to_svm(vcpu);
324         u32 ret = 0;
325
326         if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
327                 ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
328         return ret;
329 }
330
331 static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
332 {
333         struct vcpu_svm *svm = to_svm(vcpu);
334
335         if (mask == 0)
336                 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
337         else
338                 svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
339
340 }
341
342 static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
343                                            bool commit_side_effects)
344 {
345         struct vcpu_svm *svm = to_svm(vcpu);
346         unsigned long old_rflags;
347
348         /*
349          * SEV-ES does not expose the next RIP. The RIP update is controlled by
350          * the type of exit and the #VC handler in the guest.
351          */
352         if (sev_es_guest(vcpu->kvm))
353                 goto done;
354
355         if (nrips && svm->vmcb->control.next_rip != 0) {
356                 WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
357                 svm->next_rip = svm->vmcb->control.next_rip;
358         }
359
360         if (!svm->next_rip) {
361                 if (unlikely(!commit_side_effects))
362                         old_rflags = svm->vmcb->save.rflags;
363
364                 if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
365                         return 0;
366
367                 if (unlikely(!commit_side_effects))
368                         svm->vmcb->save.rflags = old_rflags;
369         } else {
370                 kvm_rip_write(vcpu, svm->next_rip);
371         }
372
373 done:
374         if (likely(commit_side_effects))
375                 svm_set_interrupt_shadow(vcpu, 0);
376
377         return 1;
378 }
379
380 static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
381 {
382         return __svm_skip_emulated_instruction(vcpu, true);
383 }
384
385 static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu)
386 {
387         unsigned long rip, old_rip = kvm_rip_read(vcpu);
388         struct vcpu_svm *svm = to_svm(vcpu);
389
390         /*
391          * Due to architectural shortcomings, the CPU doesn't always provide
392          * NextRIP, e.g. if KVM intercepted an exception that occurred while
393          * the CPU was vectoring an INTO/INT3 in the guest.  Temporarily skip
394          * the instruction even if NextRIP is supported to acquire the next
395          * RIP so that it can be shoved into the NextRIP field, otherwise
396          * hardware will fail to advance guest RIP during event injection.
397          * Drop the exception/interrupt if emulation fails and effectively
398          * retry the instruction, it's the least awful option.  If NRIPS is
399          * in use, the skip must not commit any side effects such as clearing
400          * the interrupt shadow or RFLAGS.RF.
401          */
402         if (!__svm_skip_emulated_instruction(vcpu, !nrips))
403                 return -EIO;
404
405         rip = kvm_rip_read(vcpu);
406
407         /*
408          * Save the injection information, even when using next_rip, as the
409          * VMCB's next_rip will be lost (cleared on VM-Exit) if the injection
410          * doesn't complete due to a VM-Exit occurring while the CPU is
411          * vectoring the event.   Decoding the instruction isn't guaranteed to
412          * work as there may be no backing instruction, e.g. if the event is
413          * being injected by L1 for L2, or if the guest is patching INT3 into
414          * a different instruction.
415          */
416         svm->soft_int_injected = true;
417         svm->soft_int_csbase = svm->vmcb->save.cs.base;
418         svm->soft_int_old_rip = old_rip;
419         svm->soft_int_next_rip = rip;
420
421         if (nrips)
422                 kvm_rip_write(vcpu, old_rip);
423
424         if (static_cpu_has(X86_FEATURE_NRIPS))
425                 svm->vmcb->control.next_rip = rip;
426
427         return 0;
428 }
429
430 static void svm_queue_exception(struct kvm_vcpu *vcpu)
431 {
432         struct vcpu_svm *svm = to_svm(vcpu);
433         unsigned nr = vcpu->arch.exception.nr;
434         bool has_error_code = vcpu->arch.exception.has_error_code;
435         u32 error_code = vcpu->arch.exception.error_code;
436
437         kvm_deliver_exception_payload(vcpu);
438
439         if (kvm_exception_is_soft(nr) &&
440             svm_update_soft_interrupt_rip(vcpu))
441                 return;
442
443         svm->vmcb->control.event_inj = nr
444                 | SVM_EVTINJ_VALID
445                 | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
446                 | SVM_EVTINJ_TYPE_EXEPT;
447         svm->vmcb->control.event_inj_err = error_code;
448 }
449
450 static void svm_init_erratum_383(void)
451 {
452         u32 low, high;
453         int err;
454         u64 val;
455
456         if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
457                 return;
458
459         /* Use _safe variants to not break nested virtualization */
460         val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
461         if (err)
462                 return;
463
464         val |= (1ULL << 47);
465
466         low  = lower_32_bits(val);
467         high = upper_32_bits(val);
468
469         native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
470
471         erratum_383_found = true;
472 }
473
474 static void svm_init_osvw(struct kvm_vcpu *vcpu)
475 {
476         /*
477          * Guests should see errata 400 and 415 as fixed (assuming that
478          * HLT and IO instructions are intercepted).
479          */
480         vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
481         vcpu->arch.osvw.status = osvw_status & ~(6ULL);
482
483         /*
484          * By increasing VCPU's osvw.length to 3 we are telling the guest that
485          * all osvw.status bits inside that length, including bit 0 (which is
486          * reserved for erratum 298), are valid. However, if the host processor's
487          * osvw_len is 0 then osvw_status[0] carries no information. We need to
488          * be conservative here and therefore we tell the guest that erratum 298
489          * is present (because we really don't know).
490          */
491         if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
492                 vcpu->arch.osvw.status |= 1;
493 }
494
495 static int has_svm(void)
496 {
497         const char *msg;
498
499         if (!cpu_has_svm(&msg)) {
500                 printk(KERN_INFO "has_svm: %s\n", msg);
501                 return 0;
502         }
503
504         if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
505                 pr_info("KVM is unsupported when running as an SEV guest\n");
506                 return 0;
507         }
508
509         return 1;
510 }
511
512 void __svm_write_tsc_multiplier(u64 multiplier)
513 {
514         preempt_disable();
515
516         if (multiplier == __this_cpu_read(current_tsc_ratio))
517                 goto out;
518
519         wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
520         __this_cpu_write(current_tsc_ratio, multiplier);
521 out:
522         preempt_enable();
523 }
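
/*
 * MSR_AMD64_TSC_RATIO holds a fixed-point ratio with a 32-bit fractional
 * part, so SVM_TSC_RATIO_DEFAULT (1 << 32) means "guest TSC runs at the
 * host frequency".  As an illustration only, a multiplier of 3ULL << 31
 * would correspond to a ratio of 1.5, i.e. the guest TSC advancing 1.5
 * times as fast as the host's.
 */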
524
525 static void svm_hardware_disable(void)
526 {
527         /* Make sure we clean up behind us */
528         if (tsc_scaling)
529                 __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
530
531         cpu_svm_disable();
532
533         amd_pmu_disable_virt();
534 }
535
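/*
 * Per-CPU SVM enable: verify EFER.SVME isn't already set, initialize this
 * CPU's ASID bookkeeping and TSS descriptor pointer, set EFER.SVME, and
 * point MSR_VM_HSAVE_PA at the per-CPU host save area used by VMRUN.  The
 * OSVW data gathered below feeds svm_init_osvw(), which decides what
 * erratum status to advertise to guests.
 */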
536 static int svm_hardware_enable(void)
537 {
538
539         struct svm_cpu_data *sd;
540         uint64_t efer;
541         struct desc_struct *gdt;
542         int me = raw_smp_processor_id();
543
544         rdmsrl(MSR_EFER, efer);
545         if (efer & EFER_SVME)
546                 return -EBUSY;
547
548         if (!has_svm()) {
549                 pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
550                 return -EINVAL;
551         }
552         sd = per_cpu(svm_data, me);
553         if (!sd) {
554                 pr_err("%s: svm_data is NULL on %d\n", __func__, me);
555                 return -EINVAL;
556         }
557
558         sd->asid_generation = 1;
559         sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
560         sd->next_asid = sd->max_asid + 1;
561         sd->min_asid = max_sev_asid + 1;
562
563         gdt = get_current_gdt_rw();
564         sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
565
566         wrmsrl(MSR_EFER, efer | EFER_SVME);
567
568         wrmsrl(MSR_VM_HSAVE_PA, __sme_page_pa(sd->save_area));
569
570         if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
571                 /*
572                  * Set the default value, even if we don't use TSC scaling,
573                  * to avoid leaving a stale value in the MSR.
574                  */
575                 __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
576         }
577
578
579         /*
580          * Get OSVW bits.
581          *
582          * Note that it is possible to have a system with mixed processor
583          * revisions and therefore different OSVW bits. If bits are not the same
584          * on different processors then choose the worst case (i.e. if erratum
585          * is present on one processor and not on another then assume that the
586          * erratum is present everywhere).
587          */
588         if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
589                 uint64_t len, status = 0;
590                 int err;
591
592                 len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
593                 if (!err)
594                         status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
595                                                       &err);
596
597                 if (err)
598                         osvw_status = osvw_len = 0;
599                 else {
600                         if (len < osvw_len)
601                                 osvw_len = len;
602                         osvw_status |= status;
603                         osvw_status &= (1ULL << osvw_len) - 1;
604                 }
605         } else
606                 osvw_status = osvw_len = 0;
607
608         svm_init_erratum_383();
609
610         amd_pmu_enable_virt();
611
612         return 0;
613 }
614
615 static void svm_cpu_uninit(int cpu)
616 {
617         struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
618
619         if (!sd)
620                 return;
621
622         per_cpu(svm_data, cpu) = NULL;
623         kfree(sd->sev_vmcbs);
624         __free_page(sd->save_area);
625         kfree(sd);
626 }
627
628 static int svm_cpu_init(int cpu)
629 {
630         struct svm_cpu_data *sd;
631         int ret = -ENOMEM;
632
633         sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
634         if (!sd)
635                 return ret;
636         sd->cpu = cpu;
637         sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO);
638         if (!sd->save_area)
639                 goto free_cpu_data;
640
641         ret = sev_cpu_init(sd);
642         if (ret)
643                 goto free_save_area;
644
645         per_cpu(svm_data, cpu) = sd;
646
647         return 0;
648
649 free_save_area:
650         __free_page(sd->save_area);
651 free_cpu_data:
652         kfree(sd);
653         return ret;
654
655 }
656
657 static int direct_access_msr_slot(u32 msr)
658 {
659         u32 i;
660
661         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
662                 if (direct_access_msrs[i].index == msr)
663                         return i;
664
665         return -ENOENT;
666 }
667
668 static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read,
669                                      int write)
670 {
671         struct vcpu_svm *svm = to_svm(vcpu);
672         int slot = direct_access_msr_slot(msr);
673
674         if (slot == -ENOENT)
675                 return;
676
677         /* Set the shadow bitmaps to the desired intercept states */
678         if (read)
679                 set_bit(slot, svm->shadow_msr_intercept.read);
680         else
681                 clear_bit(slot, svm->shadow_msr_intercept.read);
682
683         if (write)
684                 set_bit(slot, svm->shadow_msr_intercept.write);
685         else
686                 clear_bit(slot, svm->shadow_msr_intercept.write);
687 }
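
/*
 * The shadow bitmaps record the intercept state KVM itself wants for each
 * direct-access MSR (bit set = direct access desired), independent of any
 * userspace MSR filter.  svm_msr_filter_changed() below replays this state
 * through set_msr_interception_bitmap() so the real MSRPM can be rebuilt
 * whenever the filter changes.
 */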
688
689 static bool valid_msr_intercept(u32 index)
690 {
691         return direct_access_msr_slot(index) != -ENOENT;
692 }
693
694 static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
695 {
696         u8 bit_write;
697         unsigned long tmp;
698         u32 offset;
699         u32 *msrpm;
700
701         msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
702                                       to_svm(vcpu)->msrpm;
703
704         offset    = svm_msrpm_offset(msr);
705         BUG_ON(offset == MSR_INVALID);
706
707         bit_write = 2 * (msr & 0x0f) + 1;
708         tmp       = msrpm[offset];
709
710         return !!test_bit(bit_write,  &tmp);
711 }
712
713 static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
714                                         u32 msr, int read, int write)
715 {
716         struct vcpu_svm *svm = to_svm(vcpu);
717         u8 bit_read, bit_write;
718         unsigned long tmp;
719         u32 offset;
720
721         /*
722          * If this warning triggers, extend the direct_access_msrs list at the
723          * beginning of the file.
724          */
725         WARN_ON(!valid_msr_intercept(msr));
726
727         /* MSRs denied by the userspace MSR filter must be intercepted */
728         if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
729                 read = 0;
730
731         if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
732                 write = 0;
733
734         offset    = svm_msrpm_offset(msr);
735         BUG_ON(offset == MSR_INVALID);
736
737         bit_read  = 2 * (msr & 0x0f);
738         bit_write = 2 * (msr & 0x0f) + 1;
739         tmp       = msrpm[offset];
740
741         read  ? clear_bit(bit_read,  &tmp) : set_bit(bit_read,  &tmp);
742         write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
743
744         msrpm[offset] = tmp;
745
746         svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
747         svm->nested.force_msr_bitmap_recalc = true;
748 }
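
/*
 * MSRPM layout reminder: every MSR owns two adjacent bits, an even "read"
 * bit and an odd "write" bit, and a *set* bit means the access is
 * intercepted.  That is why read/write == 1 above clears the corresponding
 * bit (direct access) while read/write == 0 sets it (trap to KVM).
 */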
749
750 void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
751                           int read, int write)
752 {
753         set_shadow_msr_intercept(vcpu, msr, read, write);
754         set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
755 }
756
757 u32 *svm_vcpu_alloc_msrpm(void)
758 {
759         unsigned int order = get_order(MSRPM_SIZE);
760         struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
761         u32 *msrpm;
762
763         if (!pages)
764                 return NULL;
765
766         msrpm = page_address(pages);
767         memset(msrpm, 0xff, PAGE_SIZE * (1 << order));
768
769         return msrpm;
770 }
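
/*
 * A freshly allocated permission map is filled with all-ones above, i.e.
 * every MSR access is intercepted by default.  svm_vcpu_init_msrpm() below
 * then opens up only the "always" direct-access MSRs; everything else
 * stays intercepted until explicitly allowed via set_msr_interception().
 */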
771
772 void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
773 {
774         int i;
775
776         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
777                 if (!direct_access_msrs[i].always)
778                         continue;
779                 set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1);
780         }
781 }
782
783
784 void svm_vcpu_free_msrpm(u32 *msrpm)
785 {
786         __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
787 }
788
789 static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
790 {
791         struct vcpu_svm *svm = to_svm(vcpu);
792         u32 i;
793
794         /*
795          * Set intercept permissions for all direct access MSRs again. They
796          * will automatically get filtered through the MSR filter, so we are
797          * back in sync after this.
798          */
799         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
800                 u32 msr = direct_access_msrs[i].index;
801                 u32 read = test_bit(i, svm->shadow_msr_intercept.read);
802                 u32 write = test_bit(i, svm->shadow_msr_intercept.write);
803
804                 set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write);
805         }
806 }
807
808 static void add_msr_offset(u32 offset)
809 {
810         int i;
811
812         for (i = 0; i < MSRPM_OFFSETS; ++i) {
813
814                 /* Offset already in list? */
815                 if (msrpm_offsets[i] == offset)
816                         return;
817
818                 /* Slot used by another offset? */
819                 if (msrpm_offsets[i] != MSR_INVALID)
820                         continue;
821
822                 /* Add offset to list */
823                 msrpm_offsets[i] = offset;
824
825                 return;
826         }
827
828         /*
829          * If this BUG triggers, the msrpm_offsets table has overflowed. Just
830          * increase MSRPM_OFFSETS in this case.
831          */
832         BUG();
833 }
834
835 static void init_msrpm_offsets(void)
836 {
837         int i;
838
839         memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
840
841         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
842                 u32 offset;
843
844                 offset = svm_msrpm_offset(direct_access_msrs[i].index);
845                 BUG_ON(offset == MSR_INVALID);
846
847                 add_msr_offset(offset);
848         }
849 }
850
851 void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
852 {
853         to_vmcb->save.dbgctl            = from_vmcb->save.dbgctl;
854         to_vmcb->save.br_from           = from_vmcb->save.br_from;
855         to_vmcb->save.br_to             = from_vmcb->save.br_to;
856         to_vmcb->save.last_excp_from    = from_vmcb->save.last_excp_from;
857         to_vmcb->save.last_excp_to      = from_vmcb->save.last_excp_to;
858
859         vmcb_mark_dirty(to_vmcb, VMCB_LBR);
860 }
861
862 static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
863 {
864         struct vcpu_svm *svm = to_svm(vcpu);
865
866         svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
867         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
868         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
869         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
870         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
871
872         /* Move the LBR msrs to the vmcb02 so that the guest can see them. */
873         if (is_guest_mode(vcpu))
874                 svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
875 }
876
877 static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
878 {
879         struct vcpu_svm *svm = to_svm(vcpu);
880
881         svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
882         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
883         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
884         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
885         set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
886
887         /*
888          * Move the LBR msrs back to the vmcb01 to avoid copying them
889          * on nested guest entries.
890          */
891         if (is_guest_mode(vcpu))
892                 svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
893 }
894
895 static u64 svm_get_lbr_msr(struct vcpu_svm *svm, u32 index)
896 {
897         /*
898          * If LBR virtualization is disabled, the LBR MSRs are always kept
899          * in the vmcb01 to avoid copying them on nested guest entries.
900          *
901          * When running nested, the MSRs are moved between vmcb01 and vmcb02
902          * as LBR virtualization is enabled or disabled.
903          */
904         struct vmcb *vmcb =
905                 (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) ?
906                         svm->vmcb : svm->vmcb01.ptr;
907
908         switch (index) {
909         case MSR_IA32_DEBUGCTLMSR:
910                 return vmcb->save.dbgctl;
911         case MSR_IA32_LASTBRANCHFROMIP:
912                 return vmcb->save.br_from;
913         case MSR_IA32_LASTBRANCHTOIP:
914                 return vmcb->save.br_to;
915         case MSR_IA32_LASTINTFROMIP:
916                 return vmcb->save.last_excp_from;
917         case MSR_IA32_LASTINTTOIP:
918                 return vmcb->save.last_excp_to;
919         default:
920                 KVM_BUG(false, svm->vcpu.kvm,
921                         "%s: Unknown MSR 0x%x", __func__, index);
922                 return 0;
923         }
924 }
925
926 void svm_update_lbrv(struct kvm_vcpu *vcpu)
927 {
928         struct vcpu_svm *svm = to_svm(vcpu);
929
930         bool enable_lbrv = svm_get_lbr_msr(svm, MSR_IA32_DEBUGCTLMSR) &
931                                            DEBUGCTLMSR_LBR;
932
933         bool current_enable_lbrv = !!(svm->vmcb->control.virt_ext &
934                                       LBR_CTL_ENABLE_MASK);
935
936         if (unlikely(is_guest_mode(vcpu) && svm->lbrv_enabled))
937                 if (unlikely(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))
938                         enable_lbrv = true;
939
940         if (enable_lbrv == current_enable_lbrv)
941                 return;
942
943         if (enable_lbrv)
944                 svm_enable_lbrv(vcpu);
945         else
946                 svm_disable_lbrv(vcpu);
947 }
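
/*
 * In short: with LBR virtualization disabled the LBR MSRs stay in vmcb01
 * so nested transitions need not copy them; when svm_enable_lbrv() or
 * svm_disable_lbrv() toggles LBR_CTL_ENABLE_MASK while L2 is running,
 * svm_copy_lbrs() moves the MSRs between vmcb01 and the current vmcb02 so
 * the guest always observes coherent values.
 */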
948
949 void disable_nmi_singlestep(struct vcpu_svm *svm)
950 {
951         svm->nmi_singlestep = false;
952
953         if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
954                 /* Clear our flags if they were not set by the guest */
955                 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
956                         svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
957                 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
958                         svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
959         }
960 }
961
962 static void grow_ple_window(struct kvm_vcpu *vcpu)
963 {
964         struct vcpu_svm *svm = to_svm(vcpu);
965         struct vmcb_control_area *control = &svm->vmcb->control;
966         int old = control->pause_filter_count;
967
968         if (kvm_pause_in_guest(vcpu->kvm))
969                 return;
970
971         control->pause_filter_count = __grow_ple_window(old,
972                                                         pause_filter_count,
973                                                         pause_filter_count_grow,
974                                                         pause_filter_count_max);
975
976         if (control->pause_filter_count != old) {
977                 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
978                 trace_kvm_ple_window_update(vcpu->vcpu_id,
979                                             control->pause_filter_count, old);
980         }
981 }
982
983 static void shrink_ple_window(struct kvm_vcpu *vcpu)
984 {
985         struct vcpu_svm *svm = to_svm(vcpu);
986         struct vmcb_control_area *control = &svm->vmcb->control;
987         int old = control->pause_filter_count;
988
989         if (kvm_pause_in_guest(vcpu->kvm))
990                 return;
991
992         control->pause_filter_count =
993                                 __shrink_ple_window(old,
994                                                     pause_filter_count,
995                                                     pause_filter_count_shrink,
996                                                     pause_filter_count);
997         if (control->pause_filter_count != old) {
998                 vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
999                 trace_kvm_ple_window_update(vcpu->vcpu_id,
1000                                             control->pause_filter_count, old);
1001         }
1002 }
1003
1004 static void svm_hardware_unsetup(void)
1005 {
1006         int cpu;
1007
1008         sev_hardware_unsetup();
1009
1010         for_each_possible_cpu(cpu)
1011                 svm_cpu_uninit(cpu);
1012
1013         __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT),
1014         get_order(IOPM_SIZE));
1015         iopm_base = 0;
1016 }
1017
1018 static void init_seg(struct vmcb_seg *seg)
1019 {
1020         seg->selector = 0;
1021         seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
1022                       SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
1023         seg->limit = 0xffff;
1024         seg->base = 0;
1025 }
1026
1027 static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
1028 {
1029         seg->selector = 0;
1030         seg->attrib = SVM_SELECTOR_P_MASK | type;
1031         seg->limit = 0xffff;
1032         seg->base = 0;
1033 }
1034
1035 static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
1036 {
1037         struct vcpu_svm *svm = to_svm(vcpu);
1038
1039         return svm->nested.ctl.tsc_offset;
1040 }
1041
1042 static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
1043 {
1044         struct vcpu_svm *svm = to_svm(vcpu);
1045
1046         return svm->tsc_ratio_msr;
1047 }
1048
1049 static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1050 {
1051         struct vcpu_svm *svm = to_svm(vcpu);
1052
1053         svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
1054         svm->vmcb->control.tsc_offset = offset;
1055         vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1056 }
1057
1058 static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
1059 {
1060         __svm_write_tsc_multiplier(multiplier);
1061 }
1062
1063
1064 /* Evaluate instruction intercepts that depend on guest CPUID features. */
1065 static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
1066                                               struct vcpu_svm *svm)
1067 {
1068         /*
1069          * Intercept INVPCID if shadow paging is enabled to sync/free shadow
1070          * roots, or if INVPCID is disabled in the guest to inject #UD.
1071          */
1072         if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
1073                 if (!npt_enabled ||
1074                     !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID))
1075                         svm_set_intercept(svm, INTERCEPT_INVPCID);
1076                 else
1077                         svm_clr_intercept(svm, INTERCEPT_INVPCID);
1078         }
1079
1080         if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
1081                 if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
1082                         svm_clr_intercept(svm, INTERCEPT_RDTSCP);
1083                 else
1084                         svm_set_intercept(svm, INTERCEPT_RDTSCP);
1085         }
1086 }
1087
1088 static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
1089 {
1090         struct vcpu_svm *svm = to_svm(vcpu);
1091
1092         if (guest_cpuid_is_intel(vcpu)) {
1093                 /*
1094                  * We must intercept SYSENTER_EIP and SYSENTER_ESP
1095                  * accesses because the processor only stores 32 bits.
1096                  * For the same reason we cannot use virtual VMLOAD/VMSAVE.
1097                  */
1098                 svm_set_intercept(svm, INTERCEPT_VMLOAD);
1099                 svm_set_intercept(svm, INTERCEPT_VMSAVE);
1100                 svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
1101
1102                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
1103                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
1104
1105                 svm->v_vmload_vmsave_enabled = false;
1106         } else {
1107                 /*
1108                  * If hardware supports Virtual VMLOAD VMSAVE then enable it
1109                  * in VMCB and clear intercepts to avoid #VMEXIT.
1110                  */
1111                 if (vls) {
1112                         svm_clr_intercept(svm, INTERCEPT_VMLOAD);
1113                         svm_clr_intercept(svm, INTERCEPT_VMSAVE);
1114                         svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
1115                 }
1116                 /* No need to intercept these MSRs */
1117                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
1118                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
1119         }
1120 }
1121
1122 static void init_vmcb(struct kvm_vcpu *vcpu)
1123 {
1124         struct vcpu_svm *svm = to_svm(vcpu);
1125         struct vmcb *vmcb = svm->vmcb01.ptr;
1126         struct vmcb_control_area *control = &vmcb->control;
1127         struct vmcb_save_area *save = &vmcb->save;
1128
1129         svm_set_intercept(svm, INTERCEPT_CR0_READ);
1130         svm_set_intercept(svm, INTERCEPT_CR3_READ);
1131         svm_set_intercept(svm, INTERCEPT_CR4_READ);
1132         svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
1133         svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
1134         svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
1135         if (!kvm_vcpu_apicv_active(vcpu))
1136                 svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
1137
1138         set_dr_intercepts(svm);
1139
1140         set_exception_intercept(svm, PF_VECTOR);
1141         set_exception_intercept(svm, UD_VECTOR);
1142         set_exception_intercept(svm, MC_VECTOR);
1143         set_exception_intercept(svm, AC_VECTOR);
1144         set_exception_intercept(svm, DB_VECTOR);
1145         /*
1146          * Guest access to VMware backdoor ports could legitimately
1147          * trigger #GP because of the TSS I/O permission bitmap.
1148          * We intercept those #GP and allow access to them anyway
1149          * as VMware does.  Don't intercept #GP for SEV guests as KVM can't
1150          * decrypt guest memory to decode the faulting instruction.
1151          */
1152         if (enable_vmware_backdoor && !sev_guest(vcpu->kvm))
1153                 set_exception_intercept(svm, GP_VECTOR);
1154
1155         svm_set_intercept(svm, INTERCEPT_INTR);
1156         svm_set_intercept(svm, INTERCEPT_NMI);
1157
1158         if (intercept_smi)
1159                 svm_set_intercept(svm, INTERCEPT_SMI);
1160
1161         svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
1162         svm_set_intercept(svm, INTERCEPT_RDPMC);
1163         svm_set_intercept(svm, INTERCEPT_CPUID);
1164         svm_set_intercept(svm, INTERCEPT_INVD);
1165         svm_set_intercept(svm, INTERCEPT_INVLPG);
1166         svm_set_intercept(svm, INTERCEPT_INVLPGA);
1167         svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
1168         svm_set_intercept(svm, INTERCEPT_MSR_PROT);
1169         svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
1170         svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
1171         svm_set_intercept(svm, INTERCEPT_VMRUN);
1172         svm_set_intercept(svm, INTERCEPT_VMMCALL);
1173         svm_set_intercept(svm, INTERCEPT_VMLOAD);
1174         svm_set_intercept(svm, INTERCEPT_VMSAVE);
1175         svm_set_intercept(svm, INTERCEPT_STGI);
1176         svm_set_intercept(svm, INTERCEPT_CLGI);
1177         svm_set_intercept(svm, INTERCEPT_SKINIT);
1178         svm_set_intercept(svm, INTERCEPT_WBINVD);
1179         svm_set_intercept(svm, INTERCEPT_XSETBV);
1180         svm_set_intercept(svm, INTERCEPT_RDPRU);
1181         svm_set_intercept(svm, INTERCEPT_RSM);
1182
1183         if (!kvm_mwait_in_guest(vcpu->kvm)) {
1184                 svm_set_intercept(svm, INTERCEPT_MONITOR);
1185                 svm_set_intercept(svm, INTERCEPT_MWAIT);
1186         }
1187
1188         if (!kvm_hlt_in_guest(vcpu->kvm))
1189                 svm_set_intercept(svm, INTERCEPT_HLT);
1190
1191         control->iopm_base_pa = __sme_set(iopm_base);
1192         control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
1193         control->int_ctl = V_INTR_MASKING_MASK;
1194
1195         init_seg(&save->es);
1196         init_seg(&save->ss);
1197         init_seg(&save->ds);
1198         init_seg(&save->fs);
1199         init_seg(&save->gs);
1200
1201         save->cs.selector = 0xf000;
1202         save->cs.base = 0xffff0000;
1203         /* Executable/Readable Code Segment */
1204         save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
1205                 SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
1206         save->cs.limit = 0xffff;
1207
1208         save->gdtr.base = 0;
1209         save->gdtr.limit = 0xffff;
1210         save->idtr.base = 0;
1211         save->idtr.limit = 0xffff;
1212
1213         init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
1214         init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
1215
1216         if (npt_enabled) {
1217                 /* Setup VMCB for Nested Paging */
1218                 control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
1219                 svm_clr_intercept(svm, INTERCEPT_INVLPG);
1220                 clr_exception_intercept(svm, PF_VECTOR);
1221                 svm_clr_intercept(svm, INTERCEPT_CR3_READ);
1222                 svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
1223                 save->g_pat = vcpu->arch.pat;
1224                 save->cr3 = 0;
1225         }
1226         svm->current_vmcb->asid_generation = 0;
1227         svm->asid = 0;
1228
1229         svm->nested.vmcb12_gpa = INVALID_GPA;
1230         svm->nested.last_vmcb12_gpa = INVALID_GPA;
1231
1232         if (!kvm_pause_in_guest(vcpu->kvm)) {
1233                 control->pause_filter_count = pause_filter_count;
1234                 if (pause_filter_thresh)
1235                         control->pause_filter_thresh = pause_filter_thresh;
1236                 svm_set_intercept(svm, INTERCEPT_PAUSE);
1237         } else {
1238                 svm_clr_intercept(svm, INTERCEPT_PAUSE);
1239         }
1240
1241         svm_recalc_instruction_intercepts(vcpu, svm);
1242
1243         /*
1244          * If the host supports V_SPEC_CTRL then disable the interception
1245          * of MSR_IA32_SPEC_CTRL.
1246          */
1247         if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
1248                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
1249
1250         if (kvm_vcpu_apicv_active(vcpu))
1251                 avic_init_vmcb(svm, vmcb);
1252
1253         if (vgif) {
1254                 svm_clr_intercept(svm, INTERCEPT_STGI);
1255                 svm_clr_intercept(svm, INTERCEPT_CLGI);
1256                 svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
1257         }
1258
1259         if (sev_guest(vcpu->kvm)) {
1260                 svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
1261                 clr_exception_intercept(svm, UD_VECTOR);
1262
1263                 if (sev_es_guest(vcpu->kvm)) {
1264                         /* Perform SEV-ES specific VMCB updates */
1265                         sev_es_init_vmcb(svm);
1266                 }
1267         }
1268
1269         svm_hv_init_vmcb(vmcb);
1270         init_vmcb_after_set_cpuid(vcpu);
1271
1272         vmcb_mark_all_dirty(vmcb);
1273
1274         enable_gif(svm);
1275 }
1276
1277 static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
1278 {
1279         struct vcpu_svm *svm = to_svm(vcpu);
1280
1281         svm_vcpu_init_msrpm(vcpu, svm->msrpm);
1282
1283         svm_init_osvw(vcpu);
1284         vcpu->arch.microcode_version = 0x01000065;
1285         svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio;
1286
1287         if (sev_es_guest(vcpu->kvm))
1288                 sev_es_vcpu_reset(svm);
1289 }
1290
1291 static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
1292 {
1293         struct vcpu_svm *svm = to_svm(vcpu);
1294
1295         svm->spec_ctrl = 0;
1296         svm->virt_spec_ctrl = 0;
1297
1298         init_vmcb(vcpu);
1299
1300         if (!init_event)
1301                 __svm_vcpu_reset(vcpu);
1302 }
1303
1304 void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
1305 {
1306         svm->current_vmcb = target_vmcb;
1307         svm->vmcb = target_vmcb->ptr;
1308 }
1309
1310 static int svm_vcpu_create(struct kvm_vcpu *vcpu)
1311 {
1312         struct vcpu_svm *svm;
1313         struct page *vmcb01_page;
1314         struct page *vmsa_page = NULL;
1315         int err;
1316
1317         BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
1318         svm = to_svm(vcpu);
1319
1320         err = -ENOMEM;
1321         vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1322         if (!vmcb01_page)
1323                 goto out;
1324
1325         if (sev_es_guest(vcpu->kvm)) {
1326                 /*
1327                  * SEV-ES guests require a separate VMSA page used to contain
1328                  * the encrypted register state of the guest.
1329                  */
1330                 vmsa_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1331                 if (!vmsa_page)
1332                         goto error_free_vmcb_page;
1333
1334                 /*
1335                  * SEV-ES guests maintain an encrypted version of their FPU
1336                  * state which is restored and saved on VMRUN and VMEXIT.
1337                  * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
1338                  * do xsave/xrstor on it.
1339                  */
1340                 fpstate_set_confidential(&vcpu->arch.guest_fpu);
1341         }
1342
1343         err = avic_init_vcpu(svm);
1344         if (err)
1345                 goto error_free_vmsa_page;
1346
1347         svm->msrpm = svm_vcpu_alloc_msrpm();
1348         if (!svm->msrpm) {
1349                 err = -ENOMEM;
1350                 goto error_free_vmsa_page;
1351         }
1352
1353         svm->vmcb01.ptr = page_address(vmcb01_page);
1354         svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
1355         svm_switch_vmcb(svm, &svm->vmcb01);
1356
1357         if (vmsa_page)
1358                 svm->sev_es.vmsa = page_address(vmsa_page);
1359
1360         svm->guest_state_loaded = false;
1361
1362         return 0;
1363
1364 error_free_vmsa_page:
1365         if (vmsa_page)
1366                 __free_page(vmsa_page);
1367 error_free_vmcb_page:
1368         __free_page(vmcb01_page);
1369 out:
1370         return err;
1371 }
1372
1373 static void svm_clear_current_vmcb(struct vmcb *vmcb)
1374 {
1375         int i;
1376
1377         for_each_online_cpu(i)
1378                 cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL);
1379 }
1380
1381 static void svm_vcpu_free(struct kvm_vcpu *vcpu)
1382 {
1383         struct vcpu_svm *svm = to_svm(vcpu);
1384
1385         /*
1386          * The vmcb page can be recycled, causing a false negative in
1387          * svm_vcpu_load(). So, ensure that no logical CPU has this
1388          * vmcb page recorded as its current vmcb.
1389          */
1390         svm_clear_current_vmcb(svm->vmcb);
1391
1392         svm_free_nested(svm);
1393
1394         sev_free_vcpu(vcpu);
1395
1396         __free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT));
1397         __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
1398 }
1399
1400 static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
1401 {
1402         struct vcpu_svm *svm = to_svm(vcpu);
1403         struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
1404
1405         if (sev_es_guest(vcpu->kvm))
1406                 sev_es_unmap_ghcb(svm);
1407
1408         if (svm->guest_state_loaded)
1409                 return;
1410
1411         /*
1412          * Save additional host state that will be restored on VMEXIT (sev-es)
1413          * or subsequent vmload of host save area.
1414          */
1415         vmsave(__sme_page_pa(sd->save_area));
1416         if (sev_es_guest(vcpu->kvm)) {
1417                 struct sev_es_save_area *hostsa;
1418                 hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400);
1419
1420                 sev_es_prepare_switch_to_guest(hostsa);
1421         }
1422
1423         if (tsc_scaling)
1424                 __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
1425
1426         if (likely(tsc_aux_uret_slot >= 0))
1427                 kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
1428
1429         svm->guest_state_loaded = true;
1430 }
1431
1432 static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
1433 {
1434         to_svm(vcpu)->guest_state_loaded = false;
1435 }
1436
1437 static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1438 {
1439         struct vcpu_svm *svm = to_svm(vcpu);
1440         struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
1441
1442         if (sd->current_vmcb != svm->vmcb) {
1443                 sd->current_vmcb = svm->vmcb;
1444                 indirect_branch_prediction_barrier();
1445         }
1446         if (kvm_vcpu_apicv_active(vcpu))
1447                 avic_vcpu_load(vcpu, cpu);
1448 }
1449
1450 static void svm_vcpu_put(struct kvm_vcpu *vcpu)
1451 {
1452         if (kvm_vcpu_apicv_active(vcpu))
1453                 avic_vcpu_put(vcpu);
1454
1455         svm_prepare_host_switch(vcpu);
1456
1457         ++vcpu->stat.host_state_reload;
1458 }
1459
1460 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
1461 {
1462         struct vcpu_svm *svm = to_svm(vcpu);
1463         unsigned long rflags = svm->vmcb->save.rflags;
1464
1465         if (svm->nmi_singlestep) {
1466                 /* Hide our flags if they were not set by the guest */
1467                 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
1468                         rflags &= ~X86_EFLAGS_TF;
1469                 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
1470                         rflags &= ~X86_EFLAGS_RF;
1471         }
1472         return rflags;
1473 }
1474
1475 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
1476 {
1477         if (to_svm(vcpu)->nmi_singlestep)
1478                 rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
1479
1480        /*
1481         * Any change of EFLAGS.VM is accompanied by a reload of SS
1482         * (caused by either a task switch or an inter-privilege IRET),
1483         * so we do not need to update the CPL here.
1484         */
1485         to_svm(vcpu)->vmcb->save.rflags = rflags;
1486 }
1487
1488 static bool svm_get_if_flag(struct kvm_vcpu *vcpu)
1489 {
1490         struct vmcb *vmcb = to_svm(vcpu)->vmcb;
1491
1492         return sev_es_guest(vcpu->kvm)
1493                 ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK
1494                 : kvm_get_rflags(vcpu) & X86_EFLAGS_IF;
1495 }
1496
1497 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1498 {
1499         kvm_register_mark_available(vcpu, reg);
1500
1501         switch (reg) {
1502         case VCPU_EXREG_PDPTR:
1503                 /*
1504                  * When !npt_enabled, mmu->pdptrs[] is already available since
1505                  * it is always updated per SDM when moving to CRs.
1506                  */
1507                 if (npt_enabled)
1508                         load_pdptrs(vcpu, kvm_read_cr3(vcpu));
1509                 break;
1510         default:
1511                 KVM_BUG_ON(1, vcpu->kvm);
1512         }
1513 }
1514
1515 static void svm_set_vintr(struct vcpu_svm *svm)
1516 {
1517         struct vmcb_control_area *control;
1518
1519         /*
1520          * The following fields are ignored when AVIC is enabled
1521          */
1522         WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu));
1523
1524         svm_set_intercept(svm, INTERCEPT_VINTR);
1525
1526         /*
1527          * This is just a dummy VINTR to actually cause a vmexit to happen.
1528          * Actual injection of virtual interrupts happens through EVENTINJ.
1529          */
1530         control = &svm->vmcb->control;
1531         control->int_vector = 0x0;
1532         control->int_ctl &= ~V_INTR_PRIO_MASK;
1533         control->int_ctl |= V_IRQ_MASK |
1534                 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
1535         vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
1536 }
1537
1538 static void svm_clear_vintr(struct vcpu_svm *svm)
1539 {
1540         svm_clr_intercept(svm, INTERCEPT_VINTR);
1541
1542         /* Drop int_ctl fields related to VINTR injection.  */
1543         svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
1544         if (is_guest_mode(&svm->vcpu)) {
1545                 svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
1546
1547                 WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
1548                         (svm->nested.ctl.int_ctl & V_TPR_MASK));
1549
1550                 svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl &
1551                         V_IRQ_INJECTION_BITS_MASK;
1552
1553                 svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
1554         }
1555
1556         vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
1557 }
1558
1559 static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
1560 {
1561         struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1562         struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;
1563
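             /*
              * FS, GS, TR and LDTR are VMLOAD/VMSAVE state that KVM keeps in
              * vmcb01 even while vmcb02 is active, so read them from there;
              * the remaining segments live in the current VMCB.
              */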
1564         switch (seg) {
1565         case VCPU_SREG_CS: return &save->cs;
1566         case VCPU_SREG_DS: return &save->ds;
1567         case VCPU_SREG_ES: return &save->es;
1568         case VCPU_SREG_FS: return &save01->fs;
1569         case VCPU_SREG_GS: return &save01->gs;
1570         case VCPU_SREG_SS: return &save->ss;
1571         case VCPU_SREG_TR: return &save01->tr;
1572         case VCPU_SREG_LDTR: return &save01->ldtr;
1573         }
1574         BUG();
1575         return NULL;
1576 }
1577
1578 static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1579 {
1580         struct vmcb_seg *s = svm_seg(vcpu, seg);
1581
1582         return s->base;
1583 }
1584
1585 static void svm_get_segment(struct kvm_vcpu *vcpu,
1586                             struct kvm_segment *var, int seg)
1587 {
1588         struct vmcb_seg *s = svm_seg(vcpu, seg);
1589
1590         var->base = s->base;
1591         var->limit = s->limit;
1592         var->selector = s->selector;
1593         var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
1594         var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
1595         var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
1596         var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
1597         var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
1598         var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
1599         var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
1600
1601         /*
1602          * AMD CPUs circa 2014 track the G bit for all segments except CS.
1603          * However, the SVM spec states that the G bit is not observed by the
1604          * CPU, and some VMware virtual CPUs drop the G bit for all segments.
1605          * So let's synthesize a legal G bit for all segments, this helps
1606          * running KVM nested. It also helps cross-vendor migration, because
1607          * Intel's vmentry has a check on the 'G' bit.
1608          */
1609         var->g = s->limit > 0xfffff;
1610
1611         /*
1612          * AMD's VMCB does not have an explicit unusable field, so emulate it
1613          * for cross-vendor migration purposes by treating "not present" as unusable.
1614          */
1615         var->unusable = !var->present;
1616
1617         switch (seg) {
1618         case VCPU_SREG_TR:
1619                 /*
1620                  * Work around a bug where the busy flag in the tr selector
1621                  * isn't exposed
1622                  */
1623                 var->type |= 0x2;
1624                 break;
1625         case VCPU_SREG_DS:
1626         case VCPU_SREG_ES:
1627         case VCPU_SREG_FS:
1628         case VCPU_SREG_GS:
1629                 /*
1630                  * The accessed bit must always be set in the segment
1631                  * descriptor cache; although it can be cleared in the
1632                  * in-memory descriptor, the cached copy always remains 1.
1633                  * Since Intel VMX checks this bit on VM-Entry, set it here
1634                  * to support cross-vendor migration.
1635                  */
1636                 if (!var->unusable)
1637                         var->type |= 0x1;
1638                 break;
1639         case VCPU_SREG_SS:
1640                 /*
1641                  * On AMD CPUs sometimes the DB bit in the segment
1642                  * descriptor is left as 1, although the whole segment has
1643                  * been made unusable. Clear it here to pass an Intel VMX
1644                  * entry check when cross vendor migrating.
1645                  */
1646                 if (var->unusable)
1647                         var->db = 0;
1648                 /* This is symmetric with svm_set_segment() */
1649                 var->dpl = to_svm(vcpu)->vmcb->save.cpl;
1650                 break;
1651         }
1652 }
1653
1654 static int svm_get_cpl(struct kvm_vcpu *vcpu)
1655 {
1656         struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
1657
1658         return save->cpl;
1659 }
1660
1661 static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
1662 {
1663         struct kvm_segment cs;
1664
1665         svm_get_segment(vcpu, &cs, VCPU_SREG_CS);
1666         *db = cs.db;
1667         *l = cs.l;
1668 }
1669
1670 static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1671 {
1672         struct vcpu_svm *svm = to_svm(vcpu);
1673
1674         dt->size = svm->vmcb->save.idtr.limit;
1675         dt->address = svm->vmcb->save.idtr.base;
1676 }
1677
1678 static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1679 {
1680         struct vcpu_svm *svm = to_svm(vcpu);
1681
1682         svm->vmcb->save.idtr.limit = dt->size;
1683         svm->vmcb->save.idtr.base = dt->address;
1684         vmcb_mark_dirty(svm->vmcb, VMCB_DT);
1685 }
1686
1687 static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1688 {
1689         struct vcpu_svm *svm = to_svm(vcpu);
1690
1691         dt->size = svm->vmcb->save.gdtr.limit;
1692         dt->address = svm->vmcb->save.gdtr.base;
1693 }
1694
1695 static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1696 {
1697         struct vcpu_svm *svm = to_svm(vcpu);
1698
1699         svm->vmcb->save.gdtr.limit = dt->size;
1700         svm->vmcb->save.gdtr.base = dt->address;
1701         vmcb_mark_dirty(svm->vmcb, VMCB_DT);
1702 }
1703
1704 static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1705 {
1706         struct vcpu_svm *svm = to_svm(vcpu);
1707
1708         /*
1709          * For guests that don't set guest_state_protected, the cr3 update is
1710          * handled via kvm_mmu_load() while entering the guest. For guests
1711          * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to
1712          * VMCB save area now, since the save area will become the initial
1713          * contents of the VMSA, and future VMCB save area updates won't be
1714          * seen.
1715          */
1716         if (sev_es_guest(vcpu->kvm)) {
1717                 svm->vmcb->save.cr3 = cr3;
1718                 vmcb_mark_dirty(svm->vmcb, VMCB_CR);
1719         }
1720 }
1721
1722 void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1723 {
1724         struct vcpu_svm *svm = to_svm(vcpu);
1725         u64 hcr0 = cr0;
1726         bool old_paging = is_paging(vcpu);
1727
1728 #ifdef CONFIG_X86_64
1729         if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
1730                 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
1731                         vcpu->arch.efer |= EFER_LMA;
1732                         svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
1733                 }
1734
1735                 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
1736                         vcpu->arch.efer &= ~EFER_LMA;
1737                         svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
1738                 }
1739         }
1740 #endif
1741         vcpu->arch.cr0 = cr0;
1742
1743         if (!npt_enabled) {
1744                 hcr0 |= X86_CR0_PG | X86_CR0_WP;
1745                 if (old_paging != is_paging(vcpu))
1746                         svm_set_cr4(vcpu, kvm_read_cr4(vcpu));
1747         }
1748
1749         /*
1750          * Re-enable caching here because the QEMU BIOS does not do it;
1751          * leaving caching disabled results in a noticeable delay at
1752          * reboot.
1753          */
1754         if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
1755                 hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);
1756
1757         svm->vmcb->save.cr0 = hcr0;
1758         vmcb_mark_dirty(svm->vmcb, VMCB_CR);
1759
1760         /*
1761          * SEV-ES guests must always keep the CR intercepts cleared. CR
1762          * tracking is done using the CR write traps.
1763          */
1764         if (sev_es_guest(vcpu->kvm))
1765                 return;
1766
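             /*
              * If the CR0 value the guest sees matches what the hardware
              * uses, plain CR0 accesses need not be intercepted and only the
              * selective CR0 write intercept stays armed; otherwise intercept
              * reads and writes so KVM can present the guest's value.
              */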
1767         if (hcr0 == cr0) {
1768                 /* Selective CR0 write remains on.  */
1769                 svm_clr_intercept(svm, INTERCEPT_CR0_READ);
1770                 svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
1771         } else {
1772                 svm_set_intercept(svm, INTERCEPT_CR0_READ);
1773                 svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
1774         }
1775 }
1776
1777 static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1778 {
1779         return true;
1780 }
1781
1782 void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1783 {
1784         unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
1785         unsigned long old_cr4 = vcpu->arch.cr4;
1786
1787         if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
1788                 svm_flush_tlb_current(vcpu);
1789
1790         vcpu->arch.cr4 = cr4;
1791         if (!npt_enabled) {
1792                 cr4 |= X86_CR4_PAE;
1793
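                     /*
                      * With shadow paging the hardware keeps paging enabled
                      * even when the guest turns it off, so strip
                      * SMEP/SMAP/PKE to keep them from acting behind the
                      * guest's back.
                      */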
1794                 if (!is_paging(vcpu))
1795                         cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
1796         }
1797         cr4 |= host_cr4_mce;
1798         to_svm(vcpu)->vmcb->save.cr4 = cr4;
1799         vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1800
1801         if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
1802                 kvm_update_cpuid_runtime(vcpu);
1803 }
1804
1805 static void svm_set_segment(struct kvm_vcpu *vcpu,
1806                             struct kvm_segment *var, int seg)
1807 {
1808         struct vcpu_svm *svm = to_svm(vcpu);
1809         struct vmcb_seg *s = svm_seg(vcpu, seg);
1810
1811         s->base = var->base;
1812         s->limit = var->limit;
1813         s->selector = var->selector;
1814         s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
1815         s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
1816         s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
1817         s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
1818         s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
1819         s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
1820         s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
1821         s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
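             /*
              * Example: a flat 64-bit code segment (type=0xb, S=1, DPL=0,
              * P=1, L=1, G=1) packs to attrib 0xa9b; a flat 32-bit data
              * segment (type=0x3, S=1, P=1, DB=1, G=1) packs to 0xc93.
              */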
1822
1823         /*
1824          * This is always accurate, except if SYSRET returned to a segment
1825          * with SS.DPL != 3.  Intel does not have this quirk, and always
1826          * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
1827          * would entail passing the CPL to userspace and back.
1828          */
1829         if (seg == VCPU_SREG_SS)
1830                 /* This is symmetric with svm_get_segment() */
1831                 svm->vmcb->save.cpl = (var->dpl & 3);
1832
1833         vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
1834 }
1835
1836 static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu)
1837 {
1838         struct vcpu_svm *svm = to_svm(vcpu);
1839
1840         clr_exception_intercept(svm, BP_VECTOR);
1841
1842         if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
1843                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
1844                         set_exception_intercept(svm, BP_VECTOR);
1845         }
1846 }
1847
1848 static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1849 {
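             /*
              * ASIDs are handed out per physical CPU; once the pool runs out,
              * bump the generation, flush all ASIDs and start reusing them
              * from min_asid.
              */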
1850         if (sd->next_asid > sd->max_asid) {
1851                 ++sd->asid_generation;
1852                 sd->next_asid = sd->min_asid;
1853                 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
1854                 vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
1855         }
1856
1857         svm->current_vmcb->asid_generation = sd->asid_generation;
1858         svm->asid = sd->next_asid++;
1859 }
1860
1861 static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
1862 {
1863         struct vmcb *vmcb = svm->vmcb;
1864
1865         if (svm->vcpu.arch.guest_state_protected)
1866                 return;
1867
1868         if (unlikely(value != vmcb->save.dr6)) {
1869                 vmcb->save.dr6 = value;
1870                 vmcb_mark_dirty(vmcb, VMCB_DR);
1871         }
1872 }
1873
1874 static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
1875 {
1876         struct vcpu_svm *svm = to_svm(vcpu);
1877
1878         if (vcpu->arch.guest_state_protected)
1879                 return;
1880
1881         get_debugreg(vcpu->arch.db[0], 0);
1882         get_debugreg(vcpu->arch.db[1], 1);
1883         get_debugreg(vcpu->arch.db[2], 2);
1884         get_debugreg(vcpu->arch.db[3], 3);
1885         /*
1886          * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here,
1887          * because db_interception might need it.  We can do it before vmentry.
1888          */
1889         vcpu->arch.dr6 = svm->vmcb->save.dr6;
1890         vcpu->arch.dr7 = svm->vmcb->save.dr7;
1891         vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
1892         set_dr_intercepts(svm);
1893 }
1894
1895 static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
1896 {
1897         struct vcpu_svm *svm = to_svm(vcpu);
1898
1899         if (vcpu->arch.guest_state_protected)
1900                 return;
1901
1902         svm->vmcb->save.dr7 = value;
1903         vmcb_mark_dirty(svm->vmcb, VMCB_DR);
1904 }
1905
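     /*
      * With decode assists the CPU records the bytes of the faulting
      * instruction in the VMCB, letting the handlers below avoid fetching
      * them from guest memory when emulation is needed.
      */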
1906 static int pf_interception(struct kvm_vcpu *vcpu)
1907 {
1908         struct vcpu_svm *svm = to_svm(vcpu);
1909
1910         u64 fault_address = svm->vmcb->control.exit_info_2;
1911         u64 error_code = svm->vmcb->control.exit_info_1;
1912
1913         return kvm_handle_page_fault(vcpu, error_code, fault_address,
1914                         static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
1915                         svm->vmcb->control.insn_bytes : NULL,
1916                         svm->vmcb->control.insn_len);
1917 }
1918
1919 static int npf_interception(struct kvm_vcpu *vcpu)
1920 {
1921         struct vcpu_svm *svm = to_svm(vcpu);
1922
1923         u64 fault_address = svm->vmcb->control.exit_info_2;
1924         u64 error_code = svm->vmcb->control.exit_info_1;
1925
1926         trace_kvm_page_fault(fault_address, error_code);
1927         return kvm_mmu_page_fault(vcpu, fault_address, error_code,
1928                         static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
1929                         svm->vmcb->control.insn_bytes : NULL,
1930                         svm->vmcb->control.insn_len);
1931 }
1932
1933 static int db_interception(struct kvm_vcpu *vcpu)
1934 {
1935         struct kvm_run *kvm_run = vcpu->run;
1936         struct vcpu_svm *svm = to_svm(vcpu);
1937
1938         if (!(vcpu->guest_debug &
1939               (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
1940               !svm->nmi_singlestep) {
1941                 u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
1942                 kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
1943                 return 1;
1944         }
1945
1946         if (svm->nmi_singlestep) {
1947                 disable_nmi_singlestep(svm);
1948                 /* Make sure we check for pending NMIs upon entry */
1949                 kvm_make_request(KVM_REQ_EVENT, vcpu);
1950         }
1951
1952         if (vcpu->guest_debug &
1953             (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
1954                 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1955                 kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
1956                 kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7;
1957                 kvm_run->debug.arch.pc =
1958                         svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1959                 kvm_run->debug.arch.exception = DB_VECTOR;
1960                 return 0;
1961         }
1962
1963         return 1;
1964 }
1965
1966 static int bp_interception(struct kvm_vcpu *vcpu)
1967 {
1968         struct vcpu_svm *svm = to_svm(vcpu);
1969         struct kvm_run *kvm_run = vcpu->run;
1970
1971         kvm_run->exit_reason = KVM_EXIT_DEBUG;
1972         kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1973         kvm_run->debug.arch.exception = BP_VECTOR;
1974         return 0;
1975 }
1976
1977 static int ud_interception(struct kvm_vcpu *vcpu)
1978 {
1979         return handle_ud(vcpu);
1980 }
1981
1982 static int ac_interception(struct kvm_vcpu *vcpu)
1983 {
1984         kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
1985         return 1;
1986 }
1987
1988 static bool is_erratum_383(void)
1989 {
1990         int err, i;
1991         u64 value;
1992
1993         if (!erratum_383_found)
1994                 return false;
1995
1996         value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
1997         if (err)
1998                 return false;
1999
2000         /* Bit 62 (the OVER flag) may or may not be set for this MCE */
2001         value &= ~(1ULL << 62);
2002
2003         if (value != 0xb600000000010015ULL)
2004                 return false;
2005
2006         /* Clear MCi_STATUS registers */
2007         for (i = 0; i < 6; ++i)
2008                 native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
2009
2010         value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
2011         if (!err) {
2012                 u32 low, high;
2013
2014                 value &= ~(1ULL << 2);
2015                 low    = lower_32_bits(value);
2016                 high   = upper_32_bits(value);
2017
2018                 native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
2019         }
2020
2021         /* Flush tlb to evict multi-match entries */
2022         __flush_tlb_all();
2023
2024         return true;
2025 }
2026
2027 static void svm_handle_mce(struct kvm_vcpu *vcpu)
2028 {
2029         if (is_erratum_383()) {
2030                 /*
2031                  * Erratum 383 triggered. Guest state is corrupt so kill the
2032                  * guest.
2033                  */
2034                 pr_err("KVM: Guest triggered AMD Erratum 383\n");
2035
2036                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2037
2038                 return;
2039         }
2040
2041         /*
2042          * On an #MC intercept the MCE handler is not called automatically in
2043          * the host. So do it by hand here.
2044          */
2045         kvm_machine_check();
2046 }
2047
2048 static int mc_interception(struct kvm_vcpu *vcpu)
2049 {
2050         return 1;
2051 }
2052
2053 static int shutdown_interception(struct kvm_vcpu *vcpu)
2054 {
2055         struct kvm_run *kvm_run = vcpu->run;
2056         struct vcpu_svm *svm = to_svm(vcpu);
2057
2058         /*
2059          * The VM save area has already been encrypted so it
2060          * cannot be reinitialized - just terminate.
2061          */
2062         if (sev_es_guest(vcpu->kvm))
2063                 return -EINVAL;
2064
2065         /*
2066          * VMCB is undefined after a SHUTDOWN intercept.  INIT the vCPU to put
2067          * the VMCB in a known good state.  Unfortunately, KVM doesn't have
2068          * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking
2069          * userspace.  From a platform perspective, INIT is acceptable behavior as
2070          * there exist bare metal platforms that automatically INIT the CPU
2071          * in response to shutdown.
2072          */
2073         clear_page(svm->vmcb);
2074         kvm_vcpu_reset(vcpu, true);
2075
2076         kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
2077         return 0;
2078 }
2079
2080 static int io_interception(struct kvm_vcpu *vcpu)
2081 {
2082         struct vcpu_svm *svm = to_svm(vcpu);
2083         u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
2084         int size, in, string;
2085         unsigned port;
2086
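             /*
              * For IOIO intercepts EXITINFO1 packs the port in bits 31:16
              * along with the string/IN flags and the access size, while
              * EXITINFO2 holds the rIP of the instruction following IN/OUT,
              * consumed as next_rip below.
              */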
2087         ++vcpu->stat.io_exits;
2088         string = (io_info & SVM_IOIO_STR_MASK) != 0;
2089         in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
2090         port = io_info >> 16;
2091         size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
2092
2093         if (string) {
2094                 if (sev_es_guest(vcpu->kvm))
2095                         return sev_es_string_io(svm, size, port, in);
2096                 else
2097                         return kvm_emulate_instruction(vcpu, 0);
2098         }
2099
2100         svm->next_rip = svm->vmcb->control.exit_info_2;
2101
2102         return kvm_fast_pio(vcpu, size, port, in);
2103 }
2104
2105 static int nmi_interception(struct kvm_vcpu *vcpu)
2106 {
2107         return 1;
2108 }
2109
2110 static int smi_interception(struct kvm_vcpu *vcpu)
2111 {
2112         return 1;
2113 }
2114
2115 static int intr_interception(struct kvm_vcpu *vcpu)
2116 {
2117         ++vcpu->stat.irq_exits;
2118         return 1;
2119 }
2120
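     /*
      * VMLOAD/VMSAVE transfer the state that VMRUN/#VMEXIT deliberately skip:
      * FS, GS, TR and LDTR (including hidden state), KernelGsBase, STAR,
      * LSTAR, CSTAR, SFMASK and the SYSENTER MSRs.  Emulate them by copying
      * between the guest's vmcb12 and the current VMCB.
      */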
2121 static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
2122 {
2123         struct vcpu_svm *svm = to_svm(vcpu);
2124         struct vmcb *vmcb12;
2125         struct kvm_host_map map;
2126         int ret;
2127
2128         if (nested_svm_check_permissions(vcpu))
2129                 return 1;
2130
2131         ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
2132         if (ret) {
2133                 if (ret == -EINVAL)
2134                         kvm_inject_gp(vcpu, 0);
2135                 return 1;
2136         }
2137
2138         vmcb12 = map.hva;
2139
2140         ret = kvm_skip_emulated_instruction(vcpu);
2141
2142         if (vmload) {
2143                 svm_copy_vmloadsave_state(svm->vmcb, vmcb12);
2144                 svm->sysenter_eip_hi = 0;
2145                 svm->sysenter_esp_hi = 0;
2146         } else {
2147                 svm_copy_vmloadsave_state(vmcb12, svm->vmcb);
2148         }
2149
2150         kvm_vcpu_unmap(vcpu, &map, true);
2151
2152         return ret;
2153 }
2154
2155 static int vmload_interception(struct kvm_vcpu *vcpu)
2156 {
2157         return vmload_vmsave_interception(vcpu, true);
2158 }
2159
2160 static int vmsave_interception(struct kvm_vcpu *vcpu)
2161 {
2162         return vmload_vmsave_interception(vcpu, false);
2163 }
2164
2165 static int vmrun_interception(struct kvm_vcpu *vcpu)
2166 {
2167         if (nested_svm_check_permissions(vcpu))
2168                 return 1;
2169
2170         return nested_svm_vmrun(vcpu);
2171 }
2172
2173 enum {
2174         NONE_SVM_INSTR,
2175         SVM_INSTR_VMRUN,
2176         SVM_INSTR_VMLOAD,
2177         SVM_INSTR_VMSAVE,
2178 };
2179
2180 /* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */
2181 static int svm_instr_opcode(struct kvm_vcpu *vcpu)
2182 {
2183         struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
2184
2185         if (ctxt->b != 0x1 || ctxt->opcode_len != 2)
2186                 return NONE_SVM_INSTR;
2187
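             /*
              * VMRUN, VMLOAD and VMSAVE are all encoded as 0F 01 /reg, so a
              * two-byte opcode ending in 0x01 plus a register-form ModRM byte
              * (0xd8/0xda/0xdb) identifies them.
              */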
2188         switch (ctxt->modrm) {
2189         case 0xd8: /* VMRUN */
2190                 return SVM_INSTR_VMRUN;
2191         case 0xda: /* VMLOAD */
2192                 return SVM_INSTR_VMLOAD;
2193         case 0xdb: /* VMSAVE */
2194                 return SVM_INSTR_VMSAVE;
2195         default:
2196                 break;
2197         }
2198
2199         return NONE_SVM_INSTR;
2200 }
2201
2202 static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
2203 {
2204         const int guest_mode_exit_codes[] = {
2205                 [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN,
2206                 [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
2207                 [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
2208         };
2209         int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
2210                 [SVM_INSTR_VMRUN] = vmrun_interception,
2211                 [SVM_INSTR_VMLOAD] = vmload_interception,
2212                 [SVM_INSTR_VMSAVE] = vmsave_interception,
2213         };
2214         struct vcpu_svm *svm = to_svm(vcpu);
2215         int ret;
2216
2217         if (is_guest_mode(vcpu)) {
2218                 /* Returns '1' or -errno on failure, '0' on success. */
2219                 ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
2220                 if (ret)
2221                         return ret;
2222                 return 1;
2223         }
2224         return svm_instr_handlers[opcode](vcpu);
2225 }
2226
2227 /*
2228  * #GP handling code. Note that #GP can be triggered in the following two
2229  * cases:
2230  *   1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on
2231  *      some AMD CPUs when the address in EAX falls in a reserved memory
2232  *      region (e.g. SMM memory on the host).
2233  *   2) VMware backdoor
2234  */
2235 static int gp_interception(struct kvm_vcpu *vcpu)
2236 {
2237         struct vcpu_svm *svm = to_svm(vcpu);
2238         u32 error_code = svm->vmcb->control.exit_info_1;
2239         int opcode;
2240
2241         /* Both #GP cases have zero error_code */
2242         if (error_code)
2243                 goto reinject;
2244
2245         /* Decode the instruction for usage later */
2246         if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
2247                 goto reinject;
2248
2249         opcode = svm_instr_opcode(vcpu);
2250
2251         if (opcode == NONE_SVM_INSTR) {
2252                 if (!enable_vmware_backdoor)
2253                         goto reinject;
2254
2255                 /*
2256                  * VMware backdoor emulation on #GP interception only handles
2257                  * IN{S}, OUT{S}, and RDPMC.
2258                  */
2259                 if (!is_guest_mode(vcpu))
2260                         return kvm_emulate_instruction(vcpu,
2261                                 EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
2262         } else {
2263                 /* All SVM instructions expect page aligned RAX */
2264                 if (svm->vmcb->save.rax & ~PAGE_MASK)
2265                         goto reinject;
2266
2267                 return emulate_svm_instr(vcpu, opcode);
2268         }
2269
2270 reinject:
2271         kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
2272         return 1;
2273 }
2274
2275 void svm_set_gif(struct vcpu_svm *svm, bool value)
2276 {
2277         if (value) {
2278                 /*
2279                  * If VGIF is enabled, the STGI intercept is only added to
2280                  * detect the opening of the SMI/NMI window; remove it now.
2281                  * Likewise, clear the VINTR intercept, we will set it
2282                  * again while processing KVM_REQ_EVENT if needed.
2283                  */
2284                 if (vgif)
2285                         svm_clr_intercept(svm, INTERCEPT_STGI);
2286                 if (svm_is_intercept(svm, INTERCEPT_VINTR))
2287                         svm_clear_vintr(svm);
2288
2289                 enable_gif(svm);
2290                 if (svm->vcpu.arch.smi_pending ||
2291                     svm->vcpu.arch.nmi_pending ||
2292                     kvm_cpu_has_injectable_intr(&svm->vcpu))
2293                         kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2294         } else {
2295                 disable_gif(svm);
2296
2297                 /*
2298                  * After a CLGI no interrupts should come.  But if vGIF is
2299                  * in use, we still rely on the VINTR intercept (rather than
2300                  * STGI) to detect an open interrupt window.
2301                  */
2302                 if (!vgif)
2303                         svm_clear_vintr(svm);
2304         }
2305 }
2306
2307 static int stgi_interception(struct kvm_vcpu *vcpu)
2308 {
2309         int ret;
2310
2311         if (nested_svm_check_permissions(vcpu))
2312                 return 1;
2313
2314         ret = kvm_skip_emulated_instruction(vcpu);
2315         svm_set_gif(to_svm(vcpu), true);
2316         return ret;
2317 }
2318
2319 static int clgi_interception(struct kvm_vcpu *vcpu)
2320 {
2321         int ret;
2322
2323         if (nested_svm_check_permissions(vcpu))
2324                 return 1;
2325
2326         ret = kvm_skip_emulated_instruction(vcpu);
2327         svm_set_gif(to_svm(vcpu), false);
2328         return ret;
2329 }
2330
2331 static int invlpga_interception(struct kvm_vcpu *vcpu)
2332 {
2333         gva_t gva = kvm_rax_read(vcpu);
2334         u32 asid = kvm_rcx_read(vcpu);
2335
2336         /* FIXME: Handle an address size prefix. */
2337         if (!is_long_mode(vcpu))
2338                 gva = (u32)gva;
2339
2340         trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva);
2341
2342         /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
2343         kvm_mmu_invlpg(vcpu, gva);
2344
2345         return kvm_skip_emulated_instruction(vcpu);
2346 }
2347
2348 static int skinit_interception(struct kvm_vcpu *vcpu)
2349 {
2350         trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu));
2351
2352         kvm_queue_exception(vcpu, UD_VECTOR);
2353         return 1;
2354 }
2355
2356 static int task_switch_interception(struct kvm_vcpu *vcpu)
2357 {
2358         struct vcpu_svm *svm = to_svm(vcpu);
2359         u16 tss_selector;
2360         int reason;
2361         int int_type = svm->vmcb->control.exit_int_info &
2362                 SVM_EXITINTINFO_TYPE_MASK;
2363         int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
2364         uint32_t type =
2365                 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
2366         uint32_t idt_v =
2367                 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
2368         bool has_error_code = false;
2369         u32 error_code = 0;
2370
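             /*
              * For a task switch intercept EXITINFO1[15:0] holds the target
              * TSS selector, EXITINFO2 carries the reason flags plus an
              * optional error code, and EXITINTINFO describes any event that
              * was being delivered when the switch was triggered.
              */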
2371         tss_selector = (u16)svm->vmcb->control.exit_info_1;
2372
2373         if (svm->vmcb->control.exit_info_2 &
2374             (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
2375                 reason = TASK_SWITCH_IRET;
2376         else if (svm->vmcb->control.exit_info_2 &
2377                  (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
2378                 reason = TASK_SWITCH_JMP;
2379         else if (idt_v)
2380                 reason = TASK_SWITCH_GATE;
2381         else
2382                 reason = TASK_SWITCH_CALL;
2383
2384         if (reason == TASK_SWITCH_GATE) {
2385                 switch (type) {
2386                 case SVM_EXITINTINFO_TYPE_NMI:
2387                         vcpu->arch.nmi_injected = false;
2388                         break;
2389                 case SVM_EXITINTINFO_TYPE_EXEPT:
2390                         if (svm->vmcb->control.exit_info_2 &
2391                             (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
2392                                 has_error_code = true;
2393                                 error_code =
2394                                         (u32)svm->vmcb->control.exit_info_2;
2395                         }
2396                         kvm_clear_exception_queue(vcpu);
2397                         break;
2398                 case SVM_EXITINTINFO_TYPE_INTR:
2399                         kvm_clear_interrupt_queue(vcpu);
2400                         break;
2401                 default:
2402                         break;
2403                 }
2404         }
2405
2406         if (reason != TASK_SWITCH_GATE ||
2407             int_type == SVM_EXITINTINFO_TYPE_SOFT ||
2408             (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
2409              (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
2410                 if (!svm_skip_emulated_instruction(vcpu))
2411                         return 0;
2412         }
2413
2414         if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
2415                 int_vec = -1;
2416
2417         return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
2418                                has_error_code, error_code);
2419 }
2420
2421 static int iret_interception(struct kvm_vcpu *vcpu)
2422 {
2423         struct vcpu_svm *svm = to_svm(vcpu);
2424
2425         ++vcpu->stat.nmi_window_exits;
2426         vcpu->arch.hflags |= HF_IRET_MASK;
2427         if (!sev_es_guest(vcpu->kvm)) {
2428                 svm_clr_intercept(svm, INTERCEPT_IRET);
2429                 svm->nmi_iret_rip = kvm_rip_read(vcpu);
2430         }
2431         kvm_make_request(KVM_REQ_EVENT, vcpu);
2432         return 1;
2433 }
2434
2435 static int invlpg_interception(struct kvm_vcpu *vcpu)
2436 {
2437         if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2438                 return kvm_emulate_instruction(vcpu, 0);
2439
2440         kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1);
2441         return kvm_skip_emulated_instruction(vcpu);
2442 }
2443
2444 static int emulate_on_interception(struct kvm_vcpu *vcpu)
2445 {
2446         return kvm_emulate_instruction(vcpu, 0);
2447 }
2448
2449 static int rsm_interception(struct kvm_vcpu *vcpu)
2450 {
2451         return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2);
2452 }
2453
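     /*
      * A selective CR0 write intercept only fires when bits other than
      * CR0.TS and CR0.MP change; apply the same filtering when deciding
      * whether a CR0 write must be reflected back to L1.
      */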
2454 static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
2455                                             unsigned long val)
2456 {
2457         struct vcpu_svm *svm = to_svm(vcpu);
2458         unsigned long cr0 = vcpu->arch.cr0;
2459         bool ret = false;
2460
2461         if (!is_guest_mode(vcpu) ||
2462             (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
2463                 return false;
2464
2465         cr0 &= ~SVM_CR0_SELECTIVE_MASK;
2466         val &= ~SVM_CR0_SELECTIVE_MASK;
2467
2468         if (cr0 ^ val) {
2469                 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
2470                 ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
2471         }
2472
2473         return ret;
2474 }
2475
2476 #define CR_VALID (1ULL << 63)
2477
2478 static int cr_interception(struct kvm_vcpu *vcpu)
2479 {
2480         struct vcpu_svm *svm = to_svm(vcpu);
2481         int reg, cr;
2482         unsigned long val;
2483         int err;
2484
2485         if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2486                 return emulate_on_interception(vcpu);
2487
2488         if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
2489                 return emulate_on_interception(vcpu);
2490
2491         reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2492         if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
2493                 cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
2494         else
2495                 cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
2496
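             /*
              * The CR exit codes are laid out as 16 reads followed by 16
              * writes, so after the adjustment above values below 16 are
              * "mov from CRn" and values of 16 and up are "mov to CRn".
              */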
2497         err = 0;
2498         if (cr >= 16) { /* mov to cr */
2499                 cr -= 16;
2500                 val = kvm_register_read(vcpu, reg);
2501                 trace_kvm_cr_write(cr, val);
2502                 switch (cr) {
2503                 case 0:
2504                         if (!check_selective_cr0_intercepted(vcpu, val))
2505                                 err = kvm_set_cr0(vcpu, val);
2506                         else
2507                                 return 1;
2508
2509                         break;
2510                 case 3:
2511                         err = kvm_set_cr3(vcpu, val);
2512                         break;
2513                 case 4:
2514                         err = kvm_set_cr4(vcpu, val);
2515                         break;
2516                 case 8:
2517                         err = kvm_set_cr8(vcpu, val);
2518                         break;
2519                 default:
2520                         WARN(1, "unhandled write to CR%d", cr);
2521                         kvm_queue_exception(vcpu, UD_VECTOR);
2522                         return 1;
2523                 }
2524         } else { /* mov from cr */
2525                 switch (cr) {
2526                 case 0:
2527                         val = kvm_read_cr0(vcpu);
2528                         break;
2529                 case 2:
2530                         val = vcpu->arch.cr2;
2531                         break;
2532                 case 3:
2533                         val = kvm_read_cr3(vcpu);
2534                         break;
2535                 case 4:
2536                         val = kvm_read_cr4(vcpu);
2537                         break;
2538                 case 8:
2539                         val = kvm_get_cr8(vcpu);
2540                         break;
2541                 default:
2542                         WARN(1, "unhandled read from CR%d", cr);
2543                         kvm_queue_exception(vcpu, UD_VECTOR);
2544                         return 1;
2545                 }
2546                 kvm_register_write(vcpu, reg, val);
2547                 trace_kvm_cr_read(cr, val);
2548         }
2549         return kvm_complete_insn_gp(vcpu, err);
2550 }
2551
2552 static int cr_trap(struct kvm_vcpu *vcpu)
2553 {
2554         struct vcpu_svm *svm = to_svm(vcpu);
2555         unsigned long old_value, new_value;
2556         unsigned int cr;
2557         int ret = 0;
2558
2559         new_value = (unsigned long)svm->vmcb->control.exit_info_1;
2560
2561         cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP;
2562         switch (cr) {
2563         case 0:
2564                 old_value = kvm_read_cr0(vcpu);
2565                 svm_set_cr0(vcpu, new_value);
2566
2567                 kvm_post_set_cr0(vcpu, old_value, new_value);
2568                 break;
2569         case 4:
2570                 old_value = kvm_read_cr4(vcpu);
2571                 svm_set_cr4(vcpu, new_value);
2572
2573                 kvm_post_set_cr4(vcpu, old_value, new_value);
2574                 break;
2575         case 8:
2576                 ret = kvm_set_cr8(vcpu, new_value);
2577                 break;
2578         default:
2579                 WARN(1, "unhandled CR%d write trap", cr);
2580                 kvm_queue_exception(vcpu, UD_VECTOR);
2581                 return 1;
2582         }
2583
2584         return kvm_complete_insn_gp(vcpu, ret);
2585 }
2586
2587 static int dr_interception(struct kvm_vcpu *vcpu)
2588 {
2589         struct vcpu_svm *svm = to_svm(vcpu);
2590         int reg, dr;
2591         unsigned long val;
2592         int err = 0;
2593
2594         if (vcpu->guest_debug == 0) {
2595                 /*
2596                  * No more DR vmexits; force a reload of the debug registers
2597                  * and reenter on this instruction.  The next vmexit will
2598                  * retrieve the full state of the debug registers.
2599                  */
2600                 clr_dr_intercepts(svm);
2601                 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
2602                 return 1;
2603         }
2604
2605         if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
2606                 return emulate_on_interception(vcpu);
2607
2608         reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2609         dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
2610         if (dr >= 16) { /* mov to DRn  */
2611                 dr -= 16;
2612                 val = kvm_register_read(vcpu, reg);
2613                 err = kvm_set_dr(vcpu, dr, val);
2614         } else {
2615                 kvm_get_dr(vcpu, dr, &val);
2616                 kvm_register_write(vcpu, reg, val);
2617         }
2618
2619         return kvm_complete_insn_gp(vcpu, err);
2620 }
2621
2622 static int cr8_write_interception(struct kvm_vcpu *vcpu)
2623 {
2624         int r;
2625
2626         u8 cr8_prev = kvm_get_cr8(vcpu);
2627         /* instruction emulation calls kvm_set_cr8() */
2628         r = cr_interception(vcpu);
2629         if (lapic_in_kernel(vcpu))
2630                 return r;
2631         if (cr8_prev <= kvm_get_cr8(vcpu))
2632                 return r;
2633         vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
2634         return 0;
2635 }
2636
2637 static int efer_trap(struct kvm_vcpu *vcpu)
2638 {
2639         struct msr_data msr_info;
2640         int ret;
2641
2642         /*
2643          * Clear the EFER_SVME bit from EFER. The SVM code always sets this
2644          * bit in svm_set_efer(), but __kvm_valid_efer() checks it against
2645          * whether the guest has X86_FEATURE_SVM - this avoids a failure if
2646          * the guest doesn't have X86_FEATURE_SVM.
2647          */
2648         msr_info.host_initiated = false;
2649         msr_info.index = MSR_EFER;
2650         msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME;
2651         ret = kvm_set_msr_common(vcpu, &msr_info);
2652
2653         return kvm_complete_insn_gp(vcpu, ret);
2654 }
2655
2656 static int svm_get_msr_feature(struct kvm_msr_entry *msr)
2657 {
2658         msr->data = 0;
2659
2660         switch (msr->index) {
2661         case MSR_F10H_DECFG:
2662                 if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
2663                         msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE;
2664                 break;
2665         case MSR_IA32_PERF_CAPABILITIES:
2666                 return 0;
2667         default:
2668                 return KVM_MSR_RET_INVALID;
2669         }
2670
2671         return 0;
2672 }
2673
2674 static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2675 {
2676         struct vcpu_svm *svm = to_svm(vcpu);
2677
2678         switch (msr_info->index) {
2679         case MSR_AMD64_TSC_RATIO:
2680                 if (!msr_info->host_initiated && !svm->tsc_scaling_enabled)
2681                         return 1;
2682                 msr_info->data = svm->tsc_ratio_msr;
2683                 break;
2684         case MSR_STAR:
2685                 msr_info->data = svm->vmcb01.ptr->save.star;
2686                 break;
2687 #ifdef CONFIG_X86_64
2688         case MSR_LSTAR:
2689                 msr_info->data = svm->vmcb01.ptr->save.lstar;
2690                 break;
2691         case MSR_CSTAR:
2692                 msr_info->data = svm->vmcb01.ptr->save.cstar;
2693                 break;
2694         case MSR_KERNEL_GS_BASE:
2695                 msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base;
2696                 break;
2697         case MSR_SYSCALL_MASK:
2698                 msr_info->data = svm->vmcb01.ptr->save.sfmask;
2699                 break;
2700 #endif
2701         case MSR_IA32_SYSENTER_CS:
2702                 msr_info->data = svm->vmcb01.ptr->save.sysenter_cs;
2703                 break;
2704         case MSR_IA32_SYSENTER_EIP:
2705                 msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip;
2706                 if (guest_cpuid_is_intel(vcpu))
2707                         msr_info->data |= (u64)svm->sysenter_eip_hi << 32;
2708                 break;
2709         case MSR_IA32_SYSENTER_ESP:
2710                 msr_info->data = svm->vmcb01.ptr->save.sysenter_esp;
2711                 if (guest_cpuid_is_intel(vcpu))
2712                         msr_info->data |= (u64)svm->sysenter_esp_hi << 32;
2713                 break;
2714         case MSR_TSC_AUX:
2715                 msr_info->data = svm->tsc_aux;
2716                 break;
2717         case MSR_IA32_DEBUGCTLMSR:
2718         case MSR_IA32_LASTBRANCHFROMIP:
2719         case MSR_IA32_LASTBRANCHTOIP:
2720         case MSR_IA32_LASTINTFROMIP:
2721         case MSR_IA32_LASTINTTOIP:
2722                 msr_info->data = svm_get_lbr_msr(svm, msr_info->index);
2723                 break;
2724         case MSR_VM_HSAVE_PA:
2725                 msr_info->data = svm->nested.hsave_msr;
2726                 break;
2727         case MSR_VM_CR:
2728                 msr_info->data = svm->nested.vm_cr_msr;
2729                 break;
2730         case MSR_IA32_SPEC_CTRL:
2731                 if (!msr_info->host_initiated &&
2732                     !guest_has_spec_ctrl_msr(vcpu))
2733                         return 1;
2734
2735                 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
2736                         msr_info->data = svm->vmcb->save.spec_ctrl;
2737                 else
2738                         msr_info->data = svm->spec_ctrl;
2739                 break;
2740         case MSR_AMD64_VIRT_SPEC_CTRL:
2741                 if (!msr_info->host_initiated &&
2742                     !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
2743                         return 1;
2744
2745                 msr_info->data = svm->virt_spec_ctrl;
2746                 break;
2747         case MSR_F15H_IC_CFG: {
2748
2749                 int family, model;
2750
2751                 family = guest_cpuid_family(vcpu);
2752                 model  = guest_cpuid_model(vcpu);
2753
2754                 if (family < 0 || model < 0)
2755                         return kvm_get_msr_common(vcpu, msr_info);
2756
2757                 msr_info->data = 0;
2758
2759                 if (family == 0x15 &&
2760                     (model >= 0x2 && model < 0x20))
2761                         msr_info->data = 0x1E;
2762                 }
2763                 break;
2764         case MSR_F10H_DECFG:
2765                 msr_info->data = svm->msr_decfg;
2766                 break;
2767         default:
2768                 return kvm_get_msr_common(vcpu, msr_info);
2769         }
2770         return 0;
2771 }
2772
2773 static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
2774 {
2775         struct vcpu_svm *svm = to_svm(vcpu);
2776         if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb))
2777                 return kvm_complete_insn_gp(vcpu, err);
2778
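             /*
              * The MSR access came in via the GHCB, so report the failure
              * through sw_exit_info_1/2; the encoded event lets the guest's
              * #VC handler raise a #GP for the original RDMSR/WRMSR.
              */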
2779         ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1);
2780         ghcb_set_sw_exit_info_2(svm->sev_es.ghcb,
2781                                 X86_TRAP_GP |
2782                                 SVM_EVTINJ_TYPE_EXEPT |
2783                                 SVM_EVTINJ_VALID);
2784         return 1;
2785 }
2786
2787 static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
2788 {
2789         struct vcpu_svm *svm = to_svm(vcpu);
2790         int svm_dis, chg_mask;
2791
2792         if (data & ~SVM_VM_CR_VALID_MASK)
2793                 return 1;
2794
2795         chg_mask = SVM_VM_CR_VALID_MASK;
2796
2797         if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
2798                 chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
2799
2800         svm->nested.vm_cr_msr &= ~chg_mask;
2801         svm->nested.vm_cr_msr |= (data & chg_mask);
2802
2803         svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
2804
2805         /* check for svm_disable while efer.svme is set */
2806         if (svm_dis && (vcpu->arch.efer & EFER_SVME))
2807                 return 1;
2808
2809         return 0;
2810 }
2811
2812 static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2813 {
2814         struct vcpu_svm *svm = to_svm(vcpu);
2815         int r;
2816
2817         u32 ecx = msr->index;
2818         u64 data = msr->data;
2819         switch (ecx) {
2820         case MSR_AMD64_TSC_RATIO:
2821
2822                 if (!svm->tsc_scaling_enabled) {
2823
2824                         if (!msr->host_initiated)
2825                                 return 1;
2826                         /*
2827                          * In case TSC scaling is not enabled, always
2828                          * leave this MSR at the default value.
2829                          *
2830                          * Due to a bug in qemu 6.2.0, it tries to set
2831                          * this MSR to 0 if TSC scaling is not enabled.
2832                          * Ignore that value as well.
2833                          */
2834                         if (data != 0 && data != svm->tsc_ratio_msr)
2835                                 return 1;
2836                         break;
2837                 }
2838
2839                 if (data & SVM_TSC_RATIO_RSVD)
2840                         return 1;
2841
2842                 svm->tsc_ratio_msr = data;
2843
2844                 if (svm->tsc_scaling_enabled && is_guest_mode(vcpu))
2845                         nested_svm_update_tsc_ratio_msr(vcpu);
2846
2847                 break;
2848         case MSR_IA32_CR_PAT:
2849                 if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
2850                         return 1;
2851                 vcpu->arch.pat = data;
2852                 svm->vmcb01.ptr->save.g_pat = data;
2853                 if (is_guest_mode(vcpu))
2854                         nested_vmcb02_compute_g_pat(svm);
2855                 vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
2856                 break;
2857         case MSR_IA32_SPEC_CTRL:
2858                 if (!msr->host_initiated &&
2859                     !guest_has_spec_ctrl_msr(vcpu))
2860                         return 1;
2861
2862                 if (kvm_spec_ctrl_test_value(data))
2863                         return 1;
2864
2865                 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
2866                         svm->vmcb->save.spec_ctrl = data;
2867                 else
2868                         svm->spec_ctrl = data;
2869                 if (!data)
2870                         break;
2871
2872                 /*
2873                  * For non-nested:
2874                  * When it's written (to non-zero) for the first time, pass
2875                  * it through.
2876                  *
2877                  * For nested:
2878                  * The handling of the MSR bitmap for L2 guests is done in
2879                  * nested_svm_vmrun_msrpm.
2880                  * We update the L1 MSR bit as well since it will end up
2881                  * touching the MSR anyway now.
2882                  */
2883                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
2884                 break;
2885         case MSR_IA32_PRED_CMD:
2886                 if (!msr->host_initiated &&
2887                     !guest_has_pred_cmd_msr(vcpu))
2888                         return 1;
2889
2890                 if (data & ~PRED_CMD_IBPB)
2891                         return 1;
2892                 if (!boot_cpu_has(X86_FEATURE_IBPB))
2893                         return 1;
2894                 if (!data)
2895                         break;
2896
2897                 wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
2898                 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
2899                 break;
2900         case MSR_AMD64_VIRT_SPEC_CTRL:
2901                 if (!msr->host_initiated &&
2902                     !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
2903                         return 1;
2904
2905                 if (data & ~SPEC_CTRL_SSBD)
2906                         return 1;
2907
2908                 svm->virt_spec_ctrl = data;
2909                 break;
2910         case MSR_STAR:
2911                 svm->vmcb01.ptr->save.star = data;
2912                 break;
2913 #ifdef CONFIG_X86_64
2914         case MSR_LSTAR:
2915                 svm->vmcb01.ptr->save.lstar = data;
2916                 break;
2917         case MSR_CSTAR:
2918                 svm->vmcb01.ptr->save.cstar = data;
2919                 break;
2920         case MSR_KERNEL_GS_BASE:
2921                 svm->vmcb01.ptr->save.kernel_gs_base = data;
2922                 break;
2923         case MSR_SYSCALL_MASK:
2924                 svm->vmcb01.ptr->save.sfmask = data;
2925                 break;
2926 #endif
2927         case MSR_IA32_SYSENTER_CS:
2928                 svm->vmcb01.ptr->save.sysenter_cs = data;
2929                 break;
2930         case MSR_IA32_SYSENTER_EIP:
2931                 svm->vmcb01.ptr->save.sysenter_eip = (u32)data;
2932                 /*
2933                  * KVM only intercepts the MSR_IA32_SYSENTER_{EIP|ESP} MSRs
2934                  * when spoofing an Intel vendor ID (for cross-vendor migration).
2935                  * In that case the intercept is used to track the high
2936                  * 32 bits of these MSRs in order to support Intel's
2937                  * implementation of SYSENTER/SYSEXIT.
2938                  */
2939                 svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
2940                 break;
2941         case MSR_IA32_SYSENTER_ESP:
2942                 svm->vmcb01.ptr->save.sysenter_esp = (u32)data;
2943                 svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0;
2944                 break;
2945         case MSR_TSC_AUX:
2946                 /*
2947                  * TSC_AUX is usually changed only during boot and never read
2948                  * directly.  Intercept TSC_AUX instead of exposing it to the
2949                  * guest via direct_access_msrs, and switch it via user return.
2950                  */
2951                 preempt_disable();
2952                 r = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
2953                 preempt_enable();
2954                 if (r)
2955                         return 1;
2956
2957                 svm->tsc_aux = data;
2958                 break;
2959         case MSR_IA32_DEBUGCTLMSR:
2960                 if (!lbrv) {
2961                         vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
2962                                     __func__, data);
2963                         break;
2964                 }
2965                 if (data & DEBUGCTL_RESERVED_BITS)
2966                         return 1;
2967
2968                 if (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK)
2969                         svm->vmcb->save.dbgctl = data;
2970                 else
2971                         svm->vmcb01.ptr->save.dbgctl = data;
2972
2973                 svm_update_lbrv(vcpu);
2974
2975                 break;
2976         case MSR_VM_HSAVE_PA:
2977                 /*
2978                  * Old kernels did not validate the value written to
2979                  * MSR_VM_HSAVE_PA.  Allow KVM_SET_MSR to set an invalid
2980                  * value to allow live migrating buggy or malicious guests
2981                  * originating from those kernels.
2982                  */
2983                 if (!msr->host_initiated && !page_address_valid(vcpu, data))
2984                         return 1;
2985
2986                 svm->nested.hsave_msr = data & PAGE_MASK;
2987                 break;
2988         case MSR_VM_CR:
2989                 return svm_set_vm_cr(vcpu, data);
2990         case MSR_VM_IGNNE:
2991                 vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
2992                 break;
2993         case MSR_F10H_DECFG: {
2994                 struct kvm_msr_entry msr_entry;
2995
2996                 msr_entry.index = msr->index;
2997                 if (svm_get_msr_feature(&msr_entry))
2998                         return 1;
2999
3000                 /* Check the supported bits */
3001                 if (data & ~msr_entry.data)
3002                         return 1;
3003
3004                 /* Don't allow the guest to change a bit, #GP */
3005                 if (!msr->host_initiated && (data ^ msr_entry.data))
3006                         return 1;
3007
3008                 svm->msr_decfg = data;
3009                 break;
3010         }
3011         default:
3012                 return kvm_set_msr_common(vcpu, msr);
3013         }
3014         return 0;
3015 }
3016
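     /* For MSR intercepts, EXITINFO1 is 1 for WRMSR and 0 for RDMSR. */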
3017 static int msr_interception(struct kvm_vcpu *vcpu)
3018 {
3019         if (to_svm(vcpu)->vmcb->control.exit_info_1)
3020                 return kvm_emulate_wrmsr(vcpu);
3021         else
3022                 return kvm_emulate_rdmsr(vcpu);
3023 }
3024
3025 static int interrupt_window_interception(struct kvm_vcpu *vcpu)
3026 {
3027         kvm_make_request(KVM_REQ_EVENT, vcpu);
3028         svm_clear_vintr(to_svm(vcpu));
3029
3030         /*
3031          * If not running nested, for AVIC, the only reason to end up here is ExtINTs.
3032          * In this case AVIC was temporarily disabled for
3033          * requesting the IRQ window and we have to re-enable it.
3034          *
3035          * If running nested, still remove the VM wide AVIC inhibit to
3036          * support the case in which the interrupt window was requested
3037          * when the vCPU was not running nested.
3038          *
3039          * Any vCPU that is still running nested keeps its AVIC inhibited
3040          * due to the per-vCPU AVIC inhibition.
3041          */
3042         kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
3043
3044         ++vcpu->stat.irq_window_exits;
3045         return 1;
3046 }
3047
3048 static int pause_interception(struct kvm_vcpu *vcpu)
3049 {
3050         bool in_kernel;
3051         /*
3052          * CPL is not made available for an SEV-ES guest, therefore
3053          * vcpu->arch.preempted_in_kernel can never be true.  Just
3054          * set in_kernel to false as well.
3055          */
3056         in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
3057
3058         grow_ple_window(vcpu);
3059
3060         kvm_vcpu_on_spin(vcpu, in_kernel);
3061         return kvm_skip_emulated_instruction(vcpu);
3062 }
3063
3064 static int invpcid_interception(struct kvm_vcpu *vcpu)
3065 {
3066         struct vcpu_svm *svm = to_svm(vcpu);
3067         unsigned long type;
3068         gva_t gva;
3069
3070         if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
3071                 kvm_queue_exception(vcpu, UD_VECTOR);
3072                 return 1;
3073         }
3074
3075         /*
3076          * For an INVPCID intercept:
3077          * EXITINFO1 provides the linear address of the memory operand.
3078          * EXITINFO2 provides the contents of the register operand.
3079          */
3080         type = svm->vmcb->control.exit_info_2;
3081         gva = svm->vmcb->control.exit_info_1;
3082
3083         return kvm_handle_invpcid(vcpu, type, gva);
3084 }
3085
3086 static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3087         [SVM_EXIT_READ_CR0]                     = cr_interception,
3088         [SVM_EXIT_READ_CR3]                     = cr_interception,
3089         [SVM_EXIT_READ_CR4]                     = cr_interception,
3090         [SVM_EXIT_READ_CR8]                     = cr_interception,
3091         [SVM_EXIT_CR0_SEL_WRITE]                = cr_interception,
3092         [SVM_EXIT_WRITE_CR0]                    = cr_interception,
3093         [SVM_EXIT_WRITE_CR3]                    = cr_interception,
3094         [SVM_EXIT_WRITE_CR4]                    = cr_interception,
3095         [SVM_EXIT_WRITE_CR8]                    = cr8_write_interception,
3096         [SVM_EXIT_READ_DR0]                     = dr_interception,
3097         [SVM_EXIT_READ_DR1]                     = dr_interception,
3098         [SVM_EXIT_READ_DR2]                     = dr_interception,
3099         [SVM_EXIT_READ_DR3]                     = dr_interception,
3100         [SVM_EXIT_READ_DR4]                     = dr_interception,
3101         [SVM_EXIT_READ_DR5]                     = dr_interception,
3102         [SVM_EXIT_READ_DR6]                     = dr_interception,
3103         [SVM_EXIT_READ_DR7]                     = dr_interception,
3104         [SVM_EXIT_WRITE_DR0]                    = dr_interception,
3105         [SVM_EXIT_WRITE_DR1]                    = dr_interception,
3106         [SVM_EXIT_WRITE_DR2]                    = dr_interception,
3107         [SVM_EXIT_WRITE_DR3]                    = dr_interception,
3108         [SVM_EXIT_WRITE_DR4]                    = dr_interception,
3109         [SVM_EXIT_WRITE_DR5]                    = dr_interception,
3110         [SVM_EXIT_WRITE_DR6]                    = dr_interception,
3111         [SVM_EXIT_WRITE_DR7]                    = dr_interception,
3112         [SVM_EXIT_EXCP_BASE + DB_VECTOR]        = db_interception,
3113         [SVM_EXIT_EXCP_BASE + BP_VECTOR]        = bp_interception,
3114         [SVM_EXIT_EXCP_BASE + UD_VECTOR]        = ud_interception,
3115         [SVM_EXIT_EXCP_BASE + PF_VECTOR]        = pf_interception,
3116         [SVM_EXIT_EXCP_BASE + MC_VECTOR]        = mc_interception,
3117         [SVM_EXIT_EXCP_BASE + AC_VECTOR]        = ac_interception,
3118         [SVM_EXIT_EXCP_BASE + GP_VECTOR]        = gp_interception,
3119         [SVM_EXIT_INTR]                         = intr_interception,
3120         [SVM_EXIT_NMI]                          = nmi_interception,
3121         [SVM_EXIT_SMI]                          = smi_interception,
3122         [SVM_EXIT_VINTR]                        = interrupt_window_interception,
3123         [SVM_EXIT_RDPMC]                        = kvm_emulate_rdpmc,
3124         [SVM_EXIT_CPUID]                        = kvm_emulate_cpuid,
3125         [SVM_EXIT_IRET]                         = iret_interception,
3126         [SVM_EXIT_INVD]                         = kvm_emulate_invd,
3127         [SVM_EXIT_PAUSE]                        = pause_interception,
3128         [SVM_EXIT_HLT]                          = kvm_emulate_halt,
3129         [SVM_EXIT_INVLPG]                       = invlpg_interception,
3130         [SVM_EXIT_INVLPGA]                      = invlpga_interception,
3131         [SVM_EXIT_IOIO]                         = io_interception,
3132         [SVM_EXIT_MSR]                          = msr_interception,
3133         [SVM_EXIT_TASK_SWITCH]                  = task_switch_interception,
3134         [SVM_EXIT_SHUTDOWN]                     = shutdown_interception,
3135         [SVM_EXIT_VMRUN]                        = vmrun_interception,
3136         [SVM_EXIT_VMMCALL]                      = kvm_emulate_hypercall,
3137         [SVM_EXIT_VMLOAD]                       = vmload_interception,
3138         [SVM_EXIT_VMSAVE]                       = vmsave_interception,
3139         [SVM_EXIT_STGI]                         = stgi_interception,
3140         [SVM_EXIT_CLGI]                         = clgi_interception,
3141         [SVM_EXIT_SKINIT]                       = skinit_interception,
3142         [SVM_EXIT_RDTSCP]                       = kvm_handle_invalid_op,
3143         [SVM_EXIT_WBINVD]                       = kvm_emulate_wbinvd,
3144         [SVM_EXIT_MONITOR]                      = kvm_emulate_monitor,
3145         [SVM_EXIT_MWAIT]                        = kvm_emulate_mwait,
3146         [SVM_EXIT_XSETBV]                       = kvm_emulate_xsetbv,
3147         [SVM_EXIT_RDPRU]                        = kvm_handle_invalid_op,
3148         [SVM_EXIT_EFER_WRITE_TRAP]              = efer_trap,
3149         [SVM_EXIT_CR0_WRITE_TRAP]               = cr_trap,
3150         [SVM_EXIT_CR4_WRITE_TRAP]               = cr_trap,
3151         [SVM_EXIT_CR8_WRITE_TRAP]               = cr_trap,
3152         [SVM_EXIT_INVPCID]                      = invpcid_interception,
3153         [SVM_EXIT_NPF]                          = npf_interception,
3154         [SVM_EXIT_RSM]                          = rsm_interception,
3155         [SVM_EXIT_AVIC_INCOMPLETE_IPI]          = avic_incomplete_ipi_interception,
3156         [SVM_EXIT_AVIC_UNACCELERATED_ACCESS]    = avic_unaccelerated_access_interception,
3157         [SVM_EXIT_VMGEXIT]                      = sev_handle_vmgexit,
3158 };
3159
3160 static void dump_vmcb(struct kvm_vcpu *vcpu)
3161 {
3162         struct vcpu_svm *svm = to_svm(vcpu);
3163         struct vmcb_control_area *control = &svm->vmcb->control;
3164         struct vmcb_save_area *save = &svm->vmcb->save;
3165         struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
3166
3167         if (!dump_invalid_vmcb) {
3168                 pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
3169                 return;
3170         }
3171
3172         pr_err("VMCB %p, last attempted VMRUN on CPU %d\n",
3173                svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
3174         pr_err("VMCB Control Area:\n");
3175         pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
3176         pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
3177         pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff);
3178         pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16);
3179         pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]);
3180         pr_err("%-20s%08x %08x\n", "intercepts:",
3181                control->intercepts[INTERCEPT_WORD3],
3182                control->intercepts[INTERCEPT_WORD4]);
3183         pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
3184         pr_err("%-20s%d\n", "pause filter threshold:",
3185                control->pause_filter_thresh);
3186         pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
3187         pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
3188         pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
3189         pr_err("%-20s%d\n", "asid:", control->asid);
3190         pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
3191         pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
3192         pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
3193         pr_err("%-20s%08x\n", "int_state:", control->int_state);
3194         pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
3195         pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
3196         pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
3197         pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
3198         pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
3199         pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
3200         pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
3201         pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
3202         pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa);
3203         pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
3204         pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
3205         pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
3206         pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
3207         pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
3208         pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
3209         pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
3210         pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa);
3211         pr_err("VMCB State Save Area:\n");
3212         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3213                "es:",
3214                save->es.selector, save->es.attrib,
3215                save->es.limit, save->es.base);
3216         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3217                "cs:",
3218                save->cs.selector, save->cs.attrib,
3219                save->cs.limit, save->cs.base);
3220         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3221                "ss:",
3222                save->ss.selector, save->ss.attrib,
3223                save->ss.limit, save->ss.base);
3224         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3225                "ds:",
3226                save->ds.selector, save->ds.attrib,
3227                save->ds.limit, save->ds.base);
3228         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3229                "fs:",
3230                save01->fs.selector, save01->fs.attrib,
3231                save01->fs.limit, save01->fs.base);
3232         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3233                "gs:",
3234                save01->gs.selector, save01->gs.attrib,
3235                save01->gs.limit, save01->gs.base);
3236         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3237                "gdtr:",
3238                save->gdtr.selector, save->gdtr.attrib,
3239                save->gdtr.limit, save->gdtr.base);
3240         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3241                "ldtr:",
3242                save01->ldtr.selector, save01->ldtr.attrib,
3243                save01->ldtr.limit, save01->ldtr.base);
3244         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3245                "idtr:",
3246                save->idtr.selector, save->idtr.attrib,
3247                save->idtr.limit, save->idtr.base);
3248         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3249                "tr:",
3250                save01->tr.selector, save01->tr.attrib,
3251                save01->tr.limit, save01->tr.base);
3252         pr_err("vmpl: %d   cpl:  %d               efer:          %016llx\n",
3253                save->vmpl, save->cpl, save->efer);
3254         pr_err("%-15s %016llx %-13s %016llx\n",
3255                "cr0:", save->cr0, "cr2:", save->cr2);
3256         pr_err("%-15s %016llx %-13s %016llx\n",
3257                "cr3:", save->cr3, "cr4:", save->cr4);
3258         pr_err("%-15s %016llx %-13s %016llx\n",
3259                "dr6:", save->dr6, "dr7:", save->dr7);
3260         pr_err("%-15s %016llx %-13s %016llx\n",
3261                "rip:", save->rip, "rflags:", save->rflags);
3262         pr_err("%-15s %016llx %-13s %016llx\n",
3263                "rsp:", save->rsp, "rax:", save->rax);
3264         pr_err("%-15s %016llx %-13s %016llx\n",
3265                "star:", save01->star, "lstar:", save01->lstar);
3266         pr_err("%-15s %016llx %-13s %016llx\n",
3267                "cstar:", save01->cstar, "sfmask:", save01->sfmask);
3268         pr_err("%-15s %016llx %-13s %016llx\n",
3269                "kernel_gs_base:", save01->kernel_gs_base,
3270                "sysenter_cs:", save01->sysenter_cs);
3271         pr_err("%-15s %016llx %-13s %016llx\n",
3272                "sysenter_esp:", save01->sysenter_esp,
3273                "sysenter_eip:", save01->sysenter_eip);
3274         pr_err("%-15s %016llx %-13s %016llx\n",
3275                "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
3276         pr_err("%-15s %016llx %-13s %016llx\n",
3277                "br_from:", save->br_from, "br_to:", save->br_to);
3278         pr_err("%-15s %016llx %-13s %016llx\n",
3279                "excp_from:", save->last_excp_from,
3280                "excp_to:", save->last_excp_to);
3281 }
3282
3283 static bool svm_check_exit_valid(u64 exit_code)
3284 {
3285         return (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
3286                 svm_exit_handlers[exit_code]);
3287 }
3288
3289 static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
3290 {
3291         vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code);
3292         dump_vmcb(vcpu);
3293         vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3294         vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
3295         vcpu->run->internal.ndata = 2;
3296         vcpu->run->internal.data[0] = exit_code;
3297         vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
3298         return 0;
3299 }
3300
3301 int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
3302 {
3303         if (!svm_check_exit_valid(exit_code))
3304                 return svm_handle_invalid_exit(vcpu, exit_code);
3305
3306 #ifdef CONFIG_RETPOLINE
3307         if (exit_code == SVM_EXIT_MSR)
3308                 return msr_interception(vcpu);
3309         else if (exit_code == SVM_EXIT_VINTR)
3310                 return interrupt_window_interception(vcpu);
3311         else if (exit_code == SVM_EXIT_INTR)
3312                 return intr_interception(vcpu);
3313         else if (exit_code == SVM_EXIT_HLT)
3314                 return kvm_emulate_halt(vcpu);
3315         else if (exit_code == SVM_EXIT_NPF)
3316                 return npf_interception(vcpu);
3317 #endif
3318         return svm_exit_handlers[exit_code](vcpu);
3319 }
3320
3321 static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
3322                               u64 *info1, u64 *info2,
3323                               u32 *intr_info, u32 *error_code)
3324 {
3325         struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
3326
3327         *reason = control->exit_code;
3328         *info1 = control->exit_info_1;
3329         *info2 = control->exit_info_2;
3330         *intr_info = control->exit_int_info;
3331         if ((*intr_info & SVM_EXITINTINFO_VALID) &&
3332             (*intr_info & SVM_EXITINTINFO_VALID_ERR))
3333                 *error_code = control->exit_int_info_err;
3334         else
3335                 *error_code = 0;
3336 }
3337
3338 static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
3339 {
3340         struct vcpu_svm *svm = to_svm(vcpu);
3341         struct kvm_run *kvm_run = vcpu->run;
3342         u32 exit_code = svm->vmcb->control.exit_code;
3343
3344         trace_kvm_exit(vcpu, KVM_ISA_SVM);
3345
3346         /* SEV-ES guests must use the CR write traps to track CR registers. */
3347         if (!sev_es_guest(vcpu->kvm)) {
3348                 if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
3349                         vcpu->arch.cr0 = svm->vmcb->save.cr0;
3350                 if (npt_enabled)
3351                         vcpu->arch.cr3 = svm->vmcb->save.cr3;
3352         }
3353
3354         if (is_guest_mode(vcpu)) {
3355                 int vmexit;
3356
3357                 trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM);
3358
3359                 vmexit = nested_svm_exit_special(svm);
3360
3361                 if (vmexit == NESTED_EXIT_CONTINUE)
3362                         vmexit = nested_svm_exit_handled(svm);
3363
3364                 if (vmexit == NESTED_EXIT_DONE)
3365                         return 1;
3366         }
3367
3368         if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
3369                 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3370                 kvm_run->fail_entry.hardware_entry_failure_reason
3371                         = svm->vmcb->control.exit_code;
3372                 kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
3373                 dump_vmcb(vcpu);
3374                 return 0;
3375         }
3376
3377         if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
3378             exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
3379             exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
3380             exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
3381                 printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
3382                        "exit_code 0x%x\n",
3383                        __func__, svm->vmcb->control.exit_int_info,
3384                        exit_code);
3385
3386         if (exit_fastpath != EXIT_FASTPATH_NONE)
3387                 return 1;
3388
3389         return svm_invoke_exit_handler(vcpu, exit_code);
3390 }
3391
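/*
 * The host TSS descriptor in the GDT is marked busy while TR is loaded, and
 * LTR only accepts an available (non-busy) TSS.  Flip the type back to
 * "available" before reloading TR after VMRUN.
 */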
3392 static void reload_tss(struct kvm_vcpu *vcpu)
3393 {
3394         struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
3395
3396         sd->tss_desc->type = 9; /* available 32/64-bit TSS */
3397         load_TR_desc();
3398 }
3399
3400 static void pre_svm_run(struct kvm_vcpu *vcpu)
3401 {
3402         struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
3403         struct vcpu_svm *svm = to_svm(vcpu);
3404
3405         /*
3406          * If the previous vmrun of the vmcb occurred on a different physical
3407          * cpu, then mark the vmcb dirty and assign a new asid.  Hardware's
3408          * vmcb clean bits are per logical CPU, as are KVM's asid assignments.
3409          */
3410         if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) {
3411                 svm->current_vmcb->asid_generation = 0;
3412                 vmcb_mark_all_dirty(svm->vmcb);
3413                 svm->current_vmcb->cpu = vcpu->cpu;
3414         }
3415
3416         if (sev_guest(vcpu->kvm))
3417                 return pre_sev_run(svm, vcpu->cpu);
3418
3419         /* FIXME: handle wraparound of asid_generation */
3420         if (svm->current_vmcb->asid_generation != sd->asid_generation)
3421                 new_asid(svm, sd);
3422 }
3423
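/*
 * Inject an NMI via EVENTINJ.  Except for SEV-ES guests, also intercept IRET
 * so that KVM can detect when the guest's NMI handler returns, i.e. when NMIs
 * become unmasked again.
 */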
3424 static void svm_inject_nmi(struct kvm_vcpu *vcpu)
3425 {
3426         struct vcpu_svm *svm = to_svm(vcpu);
3427
3428         svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
3429
3430         if (svm->nmi_l1_to_l2)
3431                 return;
3432
3433         vcpu->arch.hflags |= HF_NMI_MASK;
3434         if (!sev_es_guest(vcpu->kvm))
3435                 svm_set_intercept(svm, INTERCEPT_IRET);
3436         ++vcpu->stat.nmi_injections;
3437 }
3438
3439 static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
3440 {
3441         struct vcpu_svm *svm = to_svm(vcpu);
3442         u32 type;
3443
3444         if (vcpu->arch.interrupt.soft) {
3445                 if (svm_update_soft_interrupt_rip(vcpu))
3446                         return;
3447
3448                 type = SVM_EVTINJ_TYPE_SOFT;
3449         } else {
3450                 type = SVM_EVTINJ_TYPE_INTR;
3451         }
3452
3453         trace_kvm_inj_virq(vcpu->arch.interrupt.nr,
3454                            vcpu->arch.interrupt.soft, reinjected);
3455         ++vcpu->stat.irq_injections;
3456
3457         svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
3458                                        SVM_EVTINJ_VALID | type;
3459 }
3460
3461 void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
3462                                      int trig_mode, int vector)
3463 {
3464         /*
3465          * apic->apicv_active must be read after vcpu->mode.
3466          * Pairs with smp_store_release in vcpu_enter_guest.
3467          */
3468         bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE);
3469
3470         /* Note, this is called iff the local APIC is in-kernel. */
3471         if (!READ_ONCE(vcpu->arch.apic->apicv_active)) {
3472                 /* Process the interrupt via inject_pending_event */
3473                 kvm_make_request(KVM_REQ_EVENT, vcpu);
3474                 kvm_vcpu_kick(vcpu);
3475                 return;
3476         }
3477
3478         trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
3479         if (in_guest_mode) {
3480                 /*
3481                  * Signal the doorbell to tell hardware to inject the IRQ.  If
3482                  * the vCPU exits the guest before the doorbell chimes, hardware
3483                  * will automatically process AVIC interrupts at the next VMRUN.
3484                  */
3485                 avic_ring_doorbell(vcpu);
3486         } else {
3487                 /*
3488                  * Wake the vCPU if it was blocking.  KVM will then detect the
3489                  * pending IRQ when checking if the vCPU has a wake event.
3490                  */
3491                 kvm_vcpu_wake_up(vcpu);
3492         }
3493 }
3494
3495 static void svm_deliver_interrupt(struct kvm_lapic *apic,  int delivery_mode,
3496                                   int trig_mode, int vector)
3497 {
3498         kvm_lapic_set_irr(vector, apic);
3499
3500         /*
3501          * Pairs with the smp_mb_*() after setting vcpu->guest_mode in
3502          * vcpu_enter_guest() to ensure the write to the vIRR is ordered before
3503          * the read of guest_mode.  This guarantees that either VMRUN will see
3504          * and process the new vIRR entry, or that svm_complete_interrupt_delivery
3505          * will signal the doorbell if the CPU has already entered the guest.
3506          */
3507         smp_mb__after_atomic();
3508         svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector);
3509 }
3510
3511 static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3512 {
3513         struct vcpu_svm *svm = to_svm(vcpu);
3514
3515         /*
3516          * SEV-ES guests must always keep the CR intercepts cleared. CR
3517          * tracking is done using the CR write traps.
3518          */
3519         if (sev_es_guest(vcpu->kvm))
3520                 return;
3521
3522         if (nested_svm_virtualize_tpr(vcpu))
3523                 return;
3524
3525         svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
3526
3527         if (irr == -1)
3528                 return;
3529
3530         if (tpr >= irr)
3531                 svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
3532 }
3533
3534 bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
3535 {
3536         struct vcpu_svm *svm = to_svm(vcpu);
3537         struct vmcb *vmcb = svm->vmcb;
3538         bool ret;
3539
3540         if (!gif_set(svm))
3541                 return true;
3542
3543         if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
3544                 return false;
3545
3546         ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
3547               (vcpu->arch.hflags & HF_NMI_MASK);
3548
3549         return ret;
3550 }
3551
3552 static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
3553 {
3554         struct vcpu_svm *svm = to_svm(vcpu);
3555         if (svm->nested.nested_run_pending)
3556                 return -EBUSY;
3557
3558         if (svm_nmi_blocked(vcpu))
3559                 return 0;
3560
3561         /* An NMI must not be injected into L2 if it's supposed to VM-Exit.  */
3562         if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
3563                 return -EBUSY;
3564         return 1;
3565 }
3566
3567 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
3568 {
3569         return !!(vcpu->arch.hflags & HF_NMI_MASK);
3570 }
3571
3572 static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
3573 {
3574         struct vcpu_svm *svm = to_svm(vcpu);
3575
3576         if (masked) {
3577                 vcpu->arch.hflags |= HF_NMI_MASK;
3578                 if (!sev_es_guest(vcpu->kvm))
3579                         svm_set_intercept(svm, INTERCEPT_IRET);
3580         } else {
3581                 vcpu->arch.hflags &= ~HF_NMI_MASK;
3582                 if (!sev_es_guest(vcpu->kvm))
3583                         svm_clr_intercept(svm, INTERCEPT_IRET);
3584         }
3585 }
3586
3587 bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
3588 {
3589         struct vcpu_svm *svm = to_svm(vcpu);
3590         struct vmcb *vmcb = svm->vmcb;
3591
3592         if (!gif_set(svm))
3593                 return true;
3594
3595         if (is_guest_mode(vcpu)) {
3596                 /* As long as interrupts are being delivered...  */
3597                 if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
3598                     ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
3599                     : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
3600                         return true;
3601
3602                 /* ... vmexits aren't blocked by the interrupt shadow  */
3603                 if (nested_exit_on_intr(svm))
3604                         return false;
3605         } else {
3606                 if (!svm_get_if_flag(vcpu))
3607                         return true;
3608         }
3609
3610         return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK);
3611 }
3612
3613 static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
3614 {
3615         struct vcpu_svm *svm = to_svm(vcpu);
3616
3617         if (svm->nested.nested_run_pending)
3618                 return -EBUSY;
3619
3620         if (svm_interrupt_blocked(vcpu))
3621                 return 0;
3622
3623         /*
3624          * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
3625          * e.g. if the IRQ arrived asynchronously after checking nested events.
3626          */
3627         if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm))
3628                 return -EBUSY;
3629
3630         return 1;
3631 }
3632
3633 static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
3634 {
3635         struct vcpu_svm *svm = to_svm(vcpu);
3636
3637         /*
3638          * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
3639          * 1, because that's a separate STGI/VMRUN intercept.  The next time we
3640          * get that intercept, this function will be called again though and
3641          * we'll get the vintr intercept. However, if the vGIF feature is
3642          * enabled, the STGI interception will not occur. Enable the irq
3643          * window under the assumption that the hardware will set the GIF.
3644          */
3645         if (vgif || gif_set(svm)) {
3646                 /*
3647                  * The IRQ window is not needed when AVIC is enabled,
3648                  * unless there is a pending ExtINT, which cannot be injected
3649                  * via AVIC.  In that case KVM needs to temporarily disable AVIC
3650                  * and fall back to injecting the IRQ via V_IRQ.
3651                  *
3652                  * If running nested, AVIC is already locally inhibited
3653                  * on this vCPU, therefore there is no need to request
3654                  * the VM wide AVIC inhibition.
3655                  */
3656                 if (!is_guest_mode(vcpu))
3657                         kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
3658
3659                 svm_set_vintr(svm);
3660         }
3661 }
3662
3663 static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
3664 {
3665         struct vcpu_svm *svm = to_svm(vcpu);
3666
3667         if ((vcpu->arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK)
3668                 return; /* IRET will cause a vm exit */
3669
3670         if (!gif_set(svm)) {
3671                 if (vgif)
3672                         svm_set_intercept(svm, INTERCEPT_STGI);
3673                 return; /* STGI will cause a vm exit */
3674         }
3675
3676         /*
3677          * Something prevents the NMI from being injected.  Single-step over
3678          * the possible problem (IRET, exception injection, or interrupt shadow).
3679          */
3680         svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
3681         svm->nmi_singlestep = true;
3682         svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
3683 }
3684
3685 static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
3686 {
3687         struct vcpu_svm *svm = to_svm(vcpu);
3688
3689         /*
3690          * Flush only the current ASID even if the TLB flush was invoked via
3691          * kvm_flush_remote_tlbs().  Although flushing remote TLBs requires all
3692          * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and
3693          * unconditionally does a TLB flush on both nested VM-Enter and nested
3694          * VM-Exit (via kvm_mmu_reset_context()).
3695          */
3696         if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
3697                 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
3698         else
3699                 svm->current_vmcb->asid_generation--;
3700 }
3701
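/*
 * INVLPGA invalidates the TLB mapping for a single virtual address in the
 * given ASID, so only the current guest ASID is flushed.
 */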
3702 static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
3703 {
3704         struct vcpu_svm *svm = to_svm(vcpu);
3705
3706         invlpga(gva, svm->vmcb->control.asid);
3707 }
3708
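/*
 * If CR8 writes aren't intercepted, the guest may have changed V_TPR behind
 * KVM's back; propagate the value to the in-kernel local APIC's TPR.
 */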
3709 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
3710 {
3711         struct vcpu_svm *svm = to_svm(vcpu);
3712
3713         if (nested_svm_virtualize_tpr(vcpu))
3714                 return;
3715
3716         if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) {
3717                 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
3718                 kvm_set_cr8(vcpu, cr8);
3719         }
3720 }
3721
3722 static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
3723 {
3724         struct vcpu_svm *svm = to_svm(vcpu);
3725         u64 cr8;
3726
3727         if (nested_svm_virtualize_tpr(vcpu) ||
3728             kvm_vcpu_apicv_active(vcpu))
3729                 return;
3730
3731         cr8 = kvm_get_cr8(vcpu);
3732         svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
3733         svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
3734 }
3735
3736 static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector,
3737                                         int type)
3738 {
3739         bool is_exception = (type == SVM_EXITINTINFO_TYPE_EXEPT);
3740         bool is_soft = (type == SVM_EXITINTINFO_TYPE_SOFT);
3741         struct vcpu_svm *svm = to_svm(vcpu);
3742
3743         /*
3744          * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's
3745          * associated with the original soft exception/interrupt.  next_rip is
3746          * cleared on all exits that can occur while vectoring an event, so KVM
3747          * needs to manually set next_rip for re-injection.  Unlike the !nrips
3748          * case below, this needs to be done if and only if KVM is re-injecting
3749          * the same event, i.e. if the event is a soft exception/interrupt,
3750          * otherwise next_rip is unused on VMRUN.
3751          */
3752         if (nrips && (is_soft || (is_exception && kvm_exception_is_soft(vector))) &&
3753             kvm_is_linear_rip(vcpu, svm->soft_int_old_rip + svm->soft_int_csbase))
3754                 svm->vmcb->control.next_rip = svm->soft_int_next_rip;
3755         /*
3756          * If NRIPS isn't enabled, KVM must manually advance RIP prior to
3757          * injecting the soft exception/interrupt.  That advancement needs to
3758          * be unwound if vectoring didn't complete.  Note, the new event may
3759          * not be the injected event, e.g. if KVM injected an INTn, the INTn
3760          * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will
3761          * be the reported vectored event, but RIP still needs to be unwound.
3762          */
3763         else if (!nrips && (is_soft || is_exception) &&
3764                  kvm_is_linear_rip(vcpu, svm->soft_int_next_rip + svm->soft_int_csbase))
3765                 kvm_rip_write(vcpu, svm->soft_int_old_rip);
3766 }
3767
3768 static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
3769 {
3770         struct vcpu_svm *svm = to_svm(vcpu);
3771         u8 vector;
3772         int type;
3773         u32 exitintinfo = svm->vmcb->control.exit_int_info;
3774         bool nmi_l1_to_l2 = svm->nmi_l1_to_l2;
3775         bool soft_int_injected = svm->soft_int_injected;
3776
3777         svm->nmi_l1_to_l2 = false;
3778         svm->soft_int_injected = false;
3779
3780         /*
3781          * If we've made progress since setting HF_IRET_MASK, we've
3782          * executed an IRET and can allow NMI injection.
3783          */
3784         if ((vcpu->arch.hflags & HF_IRET_MASK) &&
3785             (sev_es_guest(vcpu->kvm) ||
3786              kvm_rip_read(vcpu) != svm->nmi_iret_rip)) {
3787                 vcpu->arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
3788                 kvm_make_request(KVM_REQ_EVENT, vcpu);
3789         }
3790
3791         vcpu->arch.nmi_injected = false;
3792         kvm_clear_exception_queue(vcpu);
3793         kvm_clear_interrupt_queue(vcpu);
3794
3795         if (!(exitintinfo & SVM_EXITINTINFO_VALID))
3796                 return;
3797
3798         kvm_make_request(KVM_REQ_EVENT, vcpu);
3799
3800         vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
3801         type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
3802
3803         if (soft_int_injected)
3804                 svm_complete_soft_interrupt(vcpu, vector, type);
3805
3806         switch (type) {
3807         case SVM_EXITINTINFO_TYPE_NMI:
3808                 vcpu->arch.nmi_injected = true;
3809                 svm->nmi_l1_to_l2 = nmi_l1_to_l2;
3810                 break;
3811         case SVM_EXITINTINFO_TYPE_EXEPT:
3812                 /*
3813                  * Never re-inject a #VC exception.
3814                  */
3815                 if (vector == X86_TRAP_VC)
3816                         break;
3817
3818                 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
3819                         u32 err = svm->vmcb->control.exit_int_info_err;
3820                         kvm_requeue_exception_e(vcpu, vector, err);
3821
3822                 } else
3823                         kvm_requeue_exception(vcpu, vector);
3824                 break;
3825         case SVM_EXITINTINFO_TYPE_INTR:
3826                 kvm_queue_interrupt(vcpu, vector, false);
3827                 break;
3828         case SVM_EXITINTINFO_TYPE_SOFT:
3829                 kvm_queue_interrupt(vcpu, vector, true);
3830                 break;
3831         default:
3832                 break;
3833         }
3834
3835 }
3836
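/*
 * Cancel a pending event injection by stuffing it into exit_int_info so that
 * svm_complete_interrupts() re-queues the event for a later VMRUN.
 */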
3837 static void svm_cancel_injection(struct kvm_vcpu *vcpu)
3838 {
3839         struct vcpu_svm *svm = to_svm(vcpu);
3840         struct vmcb_control_area *control = &svm->vmcb->control;
3841
3842         control->exit_int_info = control->event_inj;
3843         control->exit_int_info_err = control->event_inj_err;
3844         control->event_inj = 0;
3845         svm_complete_interrupts(vcpu);
3846 }
3847
3848 static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
3849 {
3850         return 1;
3851 }
3852
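/*
 * Only WRMSR exits (EXITINFO1 == 1) are candidates for the fastpath; RDMSR
 * and all other exits take the full exit handling path.
 */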
3853 static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
3854 {
3855         if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
3856             to_svm(vcpu)->vmcb->control.exit_info_1)
3857                 return handle_fastpath_set_msr_irqoff(vcpu);
3858
3859         return EXIT_FASTPATH_NONE;
3860 }
3861
3862 static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
3863 {
3864         struct vcpu_svm *svm = to_svm(vcpu);
3865         unsigned long vmcb_pa = svm->current_vmcb->pa;
3866
3867         guest_state_enter_irqoff();
3868
3869         if (sev_es_guest(vcpu->kvm)) {
3870                 __svm_sev_es_vcpu_run(vmcb_pa);
3871         } else {
3872                 struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
3873
3874                 /*
3875                  * Use a single vmcb (vmcb01 because it's always valid) for
3876                  * context switching guest state via VMLOAD/VMSAVE, that way
3877                  * the state doesn't need to be copied between vmcb01 and
3878                  * vmcb02 when switching vmcbs for nested virtualization.
3879                  */
3880                 vmload(svm->vmcb01.pa);
3881                 __svm_vcpu_run(vmcb_pa, (unsigned long *)&vcpu->arch.regs);
3882                 vmsave(svm->vmcb01.pa);
3883
3884                 vmload(__sme_page_pa(sd->save_area));
3885         }
3886
3887         guest_state_exit_irqoff();
3888 }
3889
3890 static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
3891 {
3892         struct vcpu_svm *svm = to_svm(vcpu);
3893
3894         trace_kvm_entry(vcpu);
3895
3896         svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
3897         svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
3898         svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
3899
3900         /*
3901          * Disable singlestep if we're injecting an interrupt/exception.
3902          * We don't want our modified rflags to be pushed on the stack where
3903          * we might not be able to easily reset them if we disabled NMI
3904          * singlestep later.
3905          */
3906         if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
3907                 /*
3908                  * Event injection happens before external interrupts cause a
3909                  * vmexit and interrupts are disabled here, so smp_send_reschedule
3910                  * is enough to force an immediate vmexit.
3911                  */
3912                 disable_nmi_singlestep(svm);
3913                 smp_send_reschedule(vcpu->cpu);
3914         }
3915
3916         pre_svm_run(vcpu);
3917
3918         sync_lapic_to_cr8(vcpu);
3919
3920         if (unlikely(svm->asid != svm->vmcb->control.asid)) {
3921                 svm->vmcb->control.asid = svm->asid;
3922                 vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
3923         }
3924         svm->vmcb->save.cr2 = vcpu->arch.cr2;
3925
3926         svm_hv_update_vp_id(svm->vmcb, vcpu);
3927
3928         /*
3929          * Run with all-zero DR6 unless needed, so that we can get the exact cause
3930          * of a #DB.
3931          */
3932         if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
3933                 svm_set_dr6(svm, vcpu->arch.dr6);
3934         else
3935                 svm_set_dr6(svm, DR6_ACTIVE_LOW);
3936
3937         clgi();
3938         kvm_load_guest_xsave_state(vcpu);
3939
3940         kvm_wait_lapic_expire(vcpu);
3941
3942         /*
3943          * If this vCPU has touched SPEC_CTRL, restore the guest's value if
3944          * it's non-zero. Since vmentry is serialising on affected CPUs, there
3945          * is no need to worry about the conditional branch over the wrmsr
3946          * being speculatively taken.
3947          */
3948         if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
3949                 x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
3950
3951         svm_vcpu_enter_exit(vcpu);
3952
3953         /*
3954          * We do not use IBRS in the kernel. If this vCPU has used the
3955          * SPEC_CTRL MSR it may have left it on; save the value and
3956          * turn it off. This is much more efficient than blindly adding
3957          * it to the atomic save/restore list. Especially as the former
3958          * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
3959          *
3960          * For non-nested case:
3961          * If the L01 MSR bitmap does not intercept the MSR, then we need to
3962          * save it.
3963          *
3964          * For nested case:
3965          * If the L02 MSR bitmap does not intercept the MSR, then we need to
3966          * save it.
3967          */
3968         if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) &&
3969             unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
3970                 svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
3971
3972         if (!sev_es_guest(vcpu->kvm))
3973                 reload_tss(vcpu);
3974
3975         if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
3976                 x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
3977
3978         if (!sev_es_guest(vcpu->kvm)) {
3979                 vcpu->arch.cr2 = svm->vmcb->save.cr2;
3980                 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
3981                 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
3982                 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
3983         }
3984         vcpu->arch.regs_dirty = 0;
3985
3986         if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
3987                 kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
3988
3989         kvm_load_host_xsave_state(vcpu);
3990         stgi();
3991
3992         /* Any pending NMI will happen here */
3993
3994         if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
3995                 kvm_after_interrupt(vcpu);
3996
3997         sync_cr8_to_lapic(vcpu);
3998
3999         svm->next_rip = 0;
4000         if (is_guest_mode(vcpu)) {
4001                 nested_sync_control_from_vmcb02(svm);
4002
4003                 /* Track VMRUNs that have made it past consistency checking */
4004                 if (svm->nested.nested_run_pending &&
4005                     svm->vmcb->control.exit_code != SVM_EXIT_ERR)
4006                         ++vcpu->stat.nested_run;
4007
4008                 svm->nested.nested_run_pending = 0;
4009         }
4010
4011         svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
4012         vmcb_mark_all_clean(svm->vmcb);
4013
4014         /* if exit due to PF check for async PF */
4015         if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
4016                 vcpu->arch.apf.host_apf_flags =
4017                         kvm_read_and_reset_apf_flags();
4018
4019         vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET;
4020
4021         /*
4022          * We need to handle MC intercepts here before the vcpu has a chance to
4023          * change the physical cpu
4024          */
4025         if (unlikely(svm->vmcb->control.exit_code ==
4026                      SVM_EXIT_EXCP_BASE + MC_VECTOR))
4027                 svm_handle_mce(vcpu);
4028
4029         svm_complete_interrupts(vcpu);
4030
4031         if (is_guest_mode(vcpu))
4032                 return EXIT_FASTPATH_NONE;
4033
4034         return svm_exit_handlers_fastpath(vcpu);
4035 }
4036
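/*
 * With NPT, root_hpa is the nested page table root (nCR3) and the guest's own
 * CR3 is left untouched.  Without NPT, the shadow page table root is loaded
 * into the VMCB's CR3, tagged with the active PCID when applicable.
 */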
4037 static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
4038                              int root_level)
4039 {
4040         struct vcpu_svm *svm = to_svm(vcpu);
4041         unsigned long cr3;
4042
4043         if (npt_enabled) {
4044                 svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
4045                 vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
4046
4047                 hv_track_root_tdp(vcpu, root_hpa);
4048
4049                 cr3 = vcpu->arch.cr3;
4050         } else if (root_level >= PT64_ROOT_4LEVEL) {
4051                 cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
4052         } else {
4053                 /* PCID in the guest should be impossible with a 32-bit MMU. */
4054                 WARN_ON_ONCE(kvm_get_active_pcid(vcpu));
4055                 cr3 = root_hpa;
4056         }
4057
4058         svm->vmcb->save.cr3 = cr3;
4059         vmcb_mark_dirty(svm->vmcb, VMCB_CR);
4060 }
4061
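/*
 * SVM may be disabled by firmware via the SVMDIS bit in the VM_CR MSR.
 */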
4062 static int is_disabled(void)
4063 {
4064         u64 vm_cr;
4065
4066         rdmsrl(MSR_VM_CR, vm_cr);
4067         if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
4068                 return 1;
4069
4070         return 0;
4071 }
4072
4073 static void
4074 svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
4075 {
4076         /*
4077          * Patch in the VMMCALL instruction:
4078          */
4079         hypercall[0] = 0x0f;
4080         hypercall[1] = 0x01;
4081         hypercall[2] = 0xd9;
4082 }
4083
4084 static int __init svm_check_processor_compat(void)
4085 {
4086         return 0;
4087 }
4088
4089 /*
4090  * The kvm parameter can be NULL (module initialization, or invocation before
4091  * VM creation). Be sure to check the kvm parameter before using it.
4092  */
4093 static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
4094 {
4095         switch (index) {
4096         case MSR_IA32_MCG_EXT_CTL:
4097         case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
4098                 return false;
4099         case MSR_IA32_SMBASE:
4100                 /* SEV-ES guests do not support SMM, so report false */
4101                 if (kvm && sev_es_guest(kvm))
4102                         return false;
4103                 break;
4104         default:
4105                 break;
4106         }
4107
4108         return true;
4109 }
4110
4111 static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
4112 {
4113         return 0;
4114 }
4115
4116 static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
4117 {
4118         struct vcpu_svm *svm = to_svm(vcpu);
4119         struct kvm_cpuid_entry2 *best;
4120         struct kvm *kvm = vcpu->kvm;
4121
4122         vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
4123                                     boot_cpu_has(X86_FEATURE_XSAVE) &&
4124                                     boot_cpu_has(X86_FEATURE_XSAVES);
4125
4126         /* Update nrips enabled cache */
4127         svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
4128                              guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
4129
4130         svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR);
4131         svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV);
4132
4133         svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
4134
4135         svm->pause_filter_enabled = kvm_cpu_cap_has(X86_FEATURE_PAUSEFILTER) &&
4136                         guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER);
4137
4138         svm->pause_threshold_enabled = kvm_cpu_cap_has(X86_FEATURE_PFTHRESHOLD) &&
4139                         guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD);
4140
4141         svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF);
4142
4143         svm_recalc_instruction_intercepts(vcpu, svm);
4144
4145         /* For SEV guests, the memory encryption bit is not reserved in CR3.  */
4146         if (sev_guest(vcpu->kvm)) {
4147                 best = kvm_find_cpuid_entry(vcpu, 0x8000001F, 0);
4148                 if (best)
4149                         vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
4150         }
4151
4152         if (kvm_vcpu_apicv_active(vcpu)) {
4153                 /*
4154                  * AVIC does not work with an x2APIC mode guest. If the X2APIC feature
4155                  * is exposed to the guest, disable AVIC.
4156                  */
4157                 if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
4158                         kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_X2APIC);
4159         }
4160         init_vmcb_after_set_cpuid(vcpu);
4161 }
4162
4163 static bool svm_has_wbinvd_exit(void)
4164 {
4165         return true;
4166 }
4167
4168 #define PRE_EX(exit)  { .exit_code = (exit), \
4169                         .stage = X86_ICPT_PRE_EXCEPT, }
4170 #define POST_EX(exit) { .exit_code = (exit), \
4171                         .stage = X86_ICPT_POST_EXCEPT, }
4172 #define POST_MEM(exit) { .exit_code = (exit), \
4173                         .stage = X86_ICPT_POST_MEMACCESS, }
4174
4175 static const struct __x86_intercept {
4176         u32 exit_code;
4177         enum x86_intercept_stage stage;
4178 } x86_intercept_map[] = {
4179         [x86_intercept_cr_read]         = POST_EX(SVM_EXIT_READ_CR0),
4180         [x86_intercept_cr_write]        = POST_EX(SVM_EXIT_WRITE_CR0),
4181         [x86_intercept_clts]            = POST_EX(SVM_EXIT_WRITE_CR0),
4182         [x86_intercept_lmsw]            = POST_EX(SVM_EXIT_WRITE_CR0),
4183         [x86_intercept_smsw]            = POST_EX(SVM_EXIT_READ_CR0),
4184         [x86_intercept_dr_read]         = POST_EX(SVM_EXIT_READ_DR0),
4185         [x86_intercept_dr_write]        = POST_EX(SVM_EXIT_WRITE_DR0),
4186         [x86_intercept_sldt]            = POST_EX(SVM_EXIT_LDTR_READ),
4187         [x86_intercept_str]             = POST_EX(SVM_EXIT_TR_READ),
4188         [x86_intercept_lldt]            = POST_EX(SVM_EXIT_LDTR_WRITE),
4189         [x86_intercept_ltr]             = POST_EX(SVM_EXIT_TR_WRITE),
4190         [x86_intercept_sgdt]            = POST_EX(SVM_EXIT_GDTR_READ),
4191         [x86_intercept_sidt]            = POST_EX(SVM_EXIT_IDTR_READ),
4192         [x86_intercept_lgdt]            = POST_EX(SVM_EXIT_GDTR_WRITE),
4193         [x86_intercept_lidt]            = POST_EX(SVM_EXIT_IDTR_WRITE),
4194         [x86_intercept_vmrun]           = POST_EX(SVM_EXIT_VMRUN),
4195         [x86_intercept_vmmcall]         = POST_EX(SVM_EXIT_VMMCALL),
4196         [x86_intercept_vmload]          = POST_EX(SVM_EXIT_VMLOAD),
4197         [x86_intercept_vmsave]          = POST_EX(SVM_EXIT_VMSAVE),
4198         [x86_intercept_stgi]            = POST_EX(SVM_EXIT_STGI),
4199         [x86_intercept_clgi]            = POST_EX(SVM_EXIT_CLGI),
4200         [x86_intercept_skinit]          = POST_EX(SVM_EXIT_SKINIT),
4201         [x86_intercept_invlpga]         = POST_EX(SVM_EXIT_INVLPGA),
4202         [x86_intercept_rdtscp]          = POST_EX(SVM_EXIT_RDTSCP),
4203         [x86_intercept_monitor]         = POST_MEM(SVM_EXIT_MONITOR),
4204         [x86_intercept_mwait]           = POST_EX(SVM_EXIT_MWAIT),
4205         [x86_intercept_invlpg]          = POST_EX(SVM_EXIT_INVLPG),
4206         [x86_intercept_invd]            = POST_EX(SVM_EXIT_INVD),
4207         [x86_intercept_wbinvd]          = POST_EX(SVM_EXIT_WBINVD),
4208         [x86_intercept_wrmsr]           = POST_EX(SVM_EXIT_MSR),
4209         [x86_intercept_rdtsc]           = POST_EX(SVM_EXIT_RDTSC),
4210         [x86_intercept_rdmsr]           = POST_EX(SVM_EXIT_MSR),
4211         [x86_intercept_rdpmc]           = POST_EX(SVM_EXIT_RDPMC),
4212         [x86_intercept_cpuid]           = PRE_EX(SVM_EXIT_CPUID),
4213         [x86_intercept_rsm]             = PRE_EX(SVM_EXIT_RSM),
4214         [x86_intercept_pause]           = PRE_EX(SVM_EXIT_PAUSE),
4215         [x86_intercept_pushf]           = PRE_EX(SVM_EXIT_PUSHF),
4216         [x86_intercept_popf]            = PRE_EX(SVM_EXIT_POPF),
4217         [x86_intercept_intn]            = PRE_EX(SVM_EXIT_SWINT),
4218         [x86_intercept_iret]            = PRE_EX(SVM_EXIT_IRET),
4219         [x86_intercept_icebp]           = PRE_EX(SVM_EXIT_ICEBP),
4220         [x86_intercept_hlt]             = POST_EX(SVM_EXIT_HLT),
4221         [x86_intercept_in]              = POST_EX(SVM_EXIT_IOIO),
4222         [x86_intercept_ins]             = POST_EX(SVM_EXIT_IOIO),
4223         [x86_intercept_out]             = POST_EX(SVM_EXIT_IOIO),
4224         [x86_intercept_outs]            = POST_EX(SVM_EXIT_IOIO),
4225         [x86_intercept_xsetbv]          = PRE_EX(SVM_EXIT_XSETBV),
4226 };
4227
4228 #undef PRE_EX
4229 #undef POST_EX
4230 #undef POST_MEM
4231
4232 static int svm_check_intercept(struct kvm_vcpu *vcpu,
4233                                struct x86_instruction_info *info,
4234                                enum x86_intercept_stage stage,
4235                                struct x86_exception *exception)
4236 {
4237         struct vcpu_svm *svm = to_svm(vcpu);
4238         int vmexit, ret = X86EMUL_CONTINUE;
4239         struct __x86_intercept icpt_info;
4240         struct vmcb *vmcb = svm->vmcb;
4241
4242         if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
4243                 goto out;
4244
4245         icpt_info = x86_intercept_map[info->intercept];
4246
4247         if (stage != icpt_info.stage)
4248                 goto out;
4249
4250         switch (icpt_info.exit_code) {
4251         case SVM_EXIT_READ_CR0:
4252                 if (info->intercept == x86_intercept_cr_read)
4253                         icpt_info.exit_code += info->modrm_reg;
4254                 break;
4255         case SVM_EXIT_WRITE_CR0: {
4256                 unsigned long cr0, val;
4257
4258                 if (info->intercept == x86_intercept_cr_write)
4259                         icpt_info.exit_code += info->modrm_reg;
4260
4261                 if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
4262                     info->intercept == x86_intercept_clts)
4263                         break;
4264
4265                 if (!(vmcb12_is_intercept(&svm->nested.ctl,
4266                                         INTERCEPT_SELECTIVE_CR0)))
4267                         break;
4268
4269                 cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
4270                 val = info->src_val  & ~SVM_CR0_SELECTIVE_MASK;
4271
4272                 if (info->intercept == x86_intercept_lmsw) {
4273                         cr0 &= 0xfUL;
4274                         val &= 0xfUL;
4275                         /* lmsw can't clear PE - catch this here */
4276                         if (cr0 & X86_CR0_PE)
4277                                 val |= X86_CR0_PE;
4278                 }
4279
4280                 if (cr0 ^ val)
4281                         icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
4282
4283                 break;
4284         }
4285         case SVM_EXIT_READ_DR0:
4286         case SVM_EXIT_WRITE_DR0:
4287                 icpt_info.exit_code += info->modrm_reg;
4288                 break;
4289         case SVM_EXIT_MSR:
4290                 if (info->intercept == x86_intercept_wrmsr)
4291                         vmcb->control.exit_info_1 = 1;
4292                 else
4293                         vmcb->control.exit_info_1 = 0;
4294                 break;
4295         case SVM_EXIT_PAUSE:
4296                 /*
4297                  * This intercept also fires for a plain NOP; PAUSE is REP NOP,
4298                  * so check for the REP prefix here.
4299                  */
4300                 if (info->rep_prefix != REPE_PREFIX)
4301                         goto out;
4302                 break;
4303         case SVM_EXIT_IOIO: {
4304                 u64 exit_info;
4305                 u32 bytes;
4306
4307                 if (info->intercept == x86_intercept_in ||
4308                     info->intercept == x86_intercept_ins) {
4309                         exit_info = ((info->src_val & 0xffff) << 16) |
4310                                 SVM_IOIO_TYPE_MASK;
4311                         bytes = info->dst_bytes;
4312                 } else {
4313                         exit_info = (info->dst_val & 0xffff) << 16;
4314                         bytes = info->src_bytes;
4315                 }
4316
4317                 if (info->intercept == x86_intercept_outs ||
4318                     info->intercept == x86_intercept_ins)
4319                         exit_info |= SVM_IOIO_STR_MASK;
4320
4321                 if (info->rep_prefix)
4322                         exit_info |= SVM_IOIO_REP_MASK;
4323
4324                 bytes = min(bytes, 4u);
4325
4326                 exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
4327
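                     /*
                      * info->ad_bytes is 2, 4 or 8; shifting it by ASIZE_SHIFT - 1
                      * sets exactly one of the 16/32/64-bit address-size flags.
                      */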
4328                 exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
4329
4330                 vmcb->control.exit_info_1 = exit_info;
4331                 vmcb->control.exit_info_2 = info->next_rip;
4332
4333                 break;
4334         }
4335         default:
4336                 break;
4337         }
4338
4339         /* TODO: Advertise NRIPS to guest hypervisor unconditionally */
4340         if (static_cpu_has(X86_FEATURE_NRIPS))
4341                 vmcb->control.next_rip  = info->next_rip;
4342         vmcb->control.exit_code = icpt_info.exit_code;
4343         vmexit = nested_svm_exit_handled(svm);
4344
4345         ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
4346                                            : X86EMUL_CONTINUE;
4347
4348 out:
4349         return ret;
4350 }
4351
4352 static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
4353 {
4354         if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR)
4355                 vcpu->arch.at_instruction_boundary = true;
4356 }
4357
4358 static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
4359 {
4360         if (!kvm_pause_in_guest(vcpu->kvm))
4361                 shrink_ple_window(vcpu);
4362 }
4363
4364 static void svm_setup_mce(struct kvm_vcpu *vcpu)
4365 {
4366         /* [63:9] are reserved. */
4367         vcpu->arch.mcg_cap &= 0x1ff;
4368 }
4369
4370 bool svm_smi_blocked(struct kvm_vcpu *vcpu)
4371 {
4372         struct vcpu_svm *svm = to_svm(vcpu);
4373
4374         /* Per APM Vol.2 15.22.2 "Response to SMI" */
4375         if (!gif_set(svm))
4376                 return true;
4377
4378         return is_smm(vcpu);
4379 }
4380
4381 static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
4382 {
4383         struct vcpu_svm *svm = to_svm(vcpu);
4384         if (svm->nested.nested_run_pending)
4385                 return -EBUSY;
4386
4387         if (svm_smi_blocked(vcpu))
4388                 return 0;
4389
4390         /* An SMI must not be injected into L2 if it's supposed to VM-Exit.  */
4391         if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
4392                 return -EBUSY;
4393
4394         return 1;
4395 }
4396
4397 static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
4398 {
4399         struct vcpu_svm *svm = to_svm(vcpu);
4400         struct kvm_host_map map_save;
4401         int ret;
4402
4403         if (!is_guest_mode(vcpu))
4404                 return 0;
4405
4406         /* FED8h - SVM Guest */
4407         put_smstate(u64, smstate, 0x7ed8, 1);
4408         /* FEE0h - SVM Guest VMCB Physical Address */
4409         put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa);
4410
4411         svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
4412         svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
4413         svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
4414
4415         ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW);
4416         if (ret)
4417                 return ret;
4418
4419         /*
4420          * KVM uses VMCB01 to store L1 host state while L2 runs but
4421          * VMCB01 is going to be used during SMM and thus the state will
4422          * be lost. Temporarily save non-VMLOAD/VMSAVE state to the host save
4423          * area pointed to by MSR_VM_HSAVE_PA. The APM guarantees that the
4424          * format of the area is identical to the guest save area, offset
4425          * by 0x400 (matches the offset of 'struct vmcb_save_area'
4426          * within 'struct vmcb'). Note: HSAVE area may also be used by
4427          * L1 hypervisor to save additional host context (e.g. KVM does
4428          * that, see svm_prepare_switch_to_guest()) which must be
4429          * preserved.
4430          */
4431         if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr),
4432                          &map_save) == -EINVAL)
4433                 return 1;
4434
4435         BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
4436
4437         svm_copy_vmrun_state(map_save.hva + 0x400,
4438                              &svm->vmcb01.ptr->save);
4439
4440         kvm_vcpu_unmap(vcpu, &map_save, true);
4441         return 0;
4442 }
4443
4444 static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
4445 {
4446         struct vcpu_svm *svm = to_svm(vcpu);
4447         struct kvm_host_map map, map_save;
4448         u64 saved_efer, vmcb12_gpa;
4449         struct vmcb *vmcb12;
4450         int ret;
4451
4452         if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
4453                 return 0;
4454
4455         /* Non-zero if SMI arrived while vCPU was in guest mode. */
4456         if (!GET_SMSTATE(u64, smstate, 0x7ed8))
4457                 return 0;
4458
4459         if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
4460                 return 1;
4461
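             /* FED0h - EFER, as saved in the 64-bit SMM state-save area. */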
4462         saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0);
4463         if (!(saved_efer & EFER_SVME))
4464                 return 1;
4465
4466         vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0);
4467         if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
4468                 return 1;
4469
4470         ret = 1;
4471         if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save) == -EINVAL)
4472                 goto unmap_map;
4473
4474         if (svm_allocate_nested(svm))
4475                 goto unmap_save;
4476
4477         /*
4478          * Restore L1 host state from L1 HSAVE area as VMCB01 was
4479          * used during SMM (see svm_enter_smm())
4480          */
4481
4482         svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400);
4483
4484         /*
4485          * Enter the nested guest now
4486          */
4487
4488         vmcb_mark_all_dirty(svm->vmcb01.ptr);
4489
4490         vmcb12 = map.hva;
4491         nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
4492         nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
4493         ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false);
4494
4495         if (ret)
4496                 goto unmap_save;
4497
4498         svm->nested.nested_run_pending = 1;
4499
4500 unmap_save:
4501         kvm_vcpu_unmap(vcpu, &map_save, true);
4502 unmap_map:
4503         kvm_vcpu_unmap(vcpu, &map, true);
4504         return ret;
4505 }
4506
4507 static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
4508 {
4509         struct vcpu_svm *svm = to_svm(vcpu);
4510
4511         if (!gif_set(svm)) {
4512                 if (vgif)
4513                         svm_set_intercept(svm, INTERCEPT_STGI);
4514                 /* STGI will cause a vm exit */
4515         } else {
4516                 /* We must be in SMM; RSM will cause a vmexit anyway.  */
4517         }
4518 }
4519
4520 static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
4521                                         void *insn, int insn_len)
4522 {
4523         bool smep, smap, is_user;
4524         unsigned long cr4;
4525         u64 error_code;
4526
4527         /* Emulation is always possible when KVM has access to all guest state. */
4528         if (!sev_guest(vcpu->kvm))
4529                 return true;
4530
4531         /* #UD and #GP should never be intercepted for SEV guests. */
4532         WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
4533                                   EMULTYPE_TRAP_UD_FORCED |
4534                                   EMULTYPE_VMWARE_GP));
4535
4536         /*
4537          * Emulation is impossible for SEV-ES guests as KVM doesn't have access
4538          * to guest register state.
4539          */
4540         if (sev_es_guest(vcpu->kvm))
4541                 return false;
4542
4543         /*
4544          * Emulation is possible if the instruction is already decoded, e.g.
4545          * when completing I/O after returning from userspace.
4546          */
4547         if (emul_type & EMULTYPE_NO_DECODE)
4548                 return true;
4549
4550         /*
4551          * Emulation is possible for SEV guests if and only if a prefilled
4552          * buffer containing the bytes of the intercepted instruction is
4553          * available. SEV guest memory is encrypted with a guest-specific key
4554          * and cannot be decrypted by KVM, i.e. KVM would read ciphertext and
4555          * decode garbage.
4556          *
4557          * Inject #UD if KVM reached this point without an instruction buffer.
4558          * In practice, this path should never be hit by a well-behaved guest,
4559          * e.g. KVM doesn't intercept #UD or #GP for SEV guests, but this path
4560          * is still theoretically reachable, e.g. via unaccelerated fault-like
4561          * AVIC access, and needs to be handled by KVM to avoid putting the
4562          * guest into an infinite loop.  Injecting #UD is somewhat arbitrary,
4563          * but it's the least awful option given the lack of insight into the guest.
4564          */
4565         if (unlikely(!insn)) {
4566                 kvm_queue_exception(vcpu, UD_VECTOR);
4567                 return false;
4568         }
4569
4570         /*
4571          * Emulate for SEV guests if the insn buffer is not empty.  The buffer
4572          * will be empty if the DecodeAssist microcode cannot fetch bytes for
4573          * the faulting instruction because the code fetch itself faulted, e.g.
4574          * the guest attempted to fetch from emulated MMIO or a guest page
4575          * table used to translate CS:RIP resides in emulated MMIO.
4576          */
4577         if (likely(insn_len))
4578                 return true;
4579
4580         /*
4581          * Detect and workaround Errata 1096 Fam_17h_00_0Fh.
4582          *
4583          * Errata:
4584          * When the CPU raises #NPF on a guest data access and vCPU CR4.SMAP=1, it is
4585          * possible that CPU microcode implementing DecodeAssist will fail to
4586          * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly
4587          * be '0'.  This happens because microcode reads CS:RIP using a _data_
4588          * load uop with CPL=0 privileges.  If the load hits an SMAP #PF, ucode
4589          * gives up and does not fill the instruction bytes buffer.
4590          *
4591          * As above, KVM reaches this point iff the VM is an SEV guest, the CPU
4592          * supports DecodeAssist, a #NPF was raised, KVM's page fault handler
4593          * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the
4594          * GuestIntrBytes field of the VMCB.
4595          *
4596          * This does _not_ mean that the erratum has been encountered, as the
4597          * DecodeAssist will also fail if the load for CS:RIP hits a legitimate
4598          * #PF, e.g. if the guest attempts to execute from emulated MMIO and
4599          * encountered a reserved/not-present #PF.
4600          *
4601          * To hit the erratum, the following conditions must be true:
4602          *    1. CR4.SMAP=1 (obviously).
4603          *    2. CR4.SMEP=0 || CPL=3.  If SMEP=1 and CPL<3, the erratum cannot
4604          *       have been hit as the guest would have encountered an SMEP
4605          *       violation #PF, not a #NPF.
4606          *    3. The #NPF is not due to a code fetch, in which case failure to
4607          *       retrieve the instruction bytes is legitimate (see above).
4608          *
4609          * In addition, don't apply the erratum workaround if the #NPF occurred
4610          * while translating guest page tables (see below).
4611          */
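             /* For #NPF exits, EXITINFO1 holds the nested page fault error code. */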
4612         error_code = to_svm(vcpu)->vmcb->control.exit_info_1;
4613         if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
4614                 goto resume_guest;
4615
4616         cr4 = kvm_read_cr4(vcpu);
4617         smep = cr4 & X86_CR4_SMEP;
4618         smap = cr4 & X86_CR4_SMAP;
4619         is_user = svm_get_cpl(vcpu) == 3;
4620         if (smap && (!smep || is_user)) {
4621                 pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n");
4622
4623                 /*
4624                  * If the fault occurred in userspace, arbitrarily inject #GP
4625                  * to avoid killing the guest and to hopefully avoid confusing
4626                  * the guest kernel too much, e.g. injecting #PF would not be
4627                  * coherent with respect to the guest's page tables.  Request
4628                  * triple fault if the fault occurred in the kernel as there's
4629                  * no fault that KVM can inject without confusing the guest.
4630                  * In practice, the triple fault is moot as no sane SEV kernel
4631                  * will execute from user memory while also running with SMAP=1.
4632                  */
4633                 if (is_user)
4634                         kvm_inject_gp(vcpu, 0);
4635                 else
4636                         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
4637         }
4638
4639 resume_guest:
4640         /*
4641          * If the erratum was not hit, simply resume the guest and let it fault
4642          * again.  While awful, e.g. the vCPU may get stuck in an infinite loop
4643          * if the fault is at CPL=0, it's the lesser of all evils.  Exiting to
4644          * userspace will kill the guest, and letting the emulator read garbage
4645          * will yield random behavior and potentially corrupt the guest.
4646          *
4647          * Simply resuming the guest is technically not a violation of the SEV
4648          * architecture.  AMD's APM states that all code fetches and page table
4649          * accesses for SEV guests are encrypted, regardless of the C-Bit.  The
4650          * APM also states that encrypted accesses to MMIO are "ignored", but
4651          * doesn't explicitly define "ignored", i.e. doing nothing and letting
4652          * the guest spin is technically "ignoring" the access.
4653          */
4654         return false;
4655 }
4656
4657 static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
4658 {
4659         struct vcpu_svm *svm = to_svm(vcpu);
4660
4661         /*
4662          * TODO: The last condition latches INIT signals on the vCPU when
4663          * the vCPU is in guest mode and vmcb12 defines an intercept on INIT.
4664          * To properly emulate the INIT intercept,
4665          * svm_check_nested_events() should call nested_svm_vmexit()
4666          * if an INIT signal is pending.
4667          */
4668         return !gif_set(svm) ||
4669                    (vmcb_is_intercept(&svm->vmcb->control, INTERCEPT_INIT));
4670 }
4671
4672 static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
4673 {
4674         if (!sev_es_guest(vcpu->kvm))
4675                 return kvm_vcpu_deliver_sipi_vector(vcpu, vector);
4676
4677         sev_vcpu_deliver_sipi_vector(vcpu, vector);
4678 }
4679
4680 static void svm_vm_destroy(struct kvm *kvm)
4681 {
4682         avic_vm_destroy(kvm);
4683         sev_vm_destroy(kvm);
4684 }
4685
4686 static int svm_vm_init(struct kvm *kvm)
4687 {
4688         if (!pause_filter_count || !pause_filter_thresh)
4689                 kvm->arch.pause_in_guest = true;
4690
4691         if (enable_apicv) {
4692                 int ret = avic_vm_init(kvm);
4693                 if (ret)
4694                         return ret;
4695         }
4696
4697         return 0;
4698 }
4699
4700 static struct kvm_x86_ops svm_x86_ops __initdata = {
4701         .name = "kvm_amd",
4702
4703         .hardware_unsetup = svm_hardware_unsetup,
4704         .hardware_enable = svm_hardware_enable,
4705         .hardware_disable = svm_hardware_disable,
4706         .has_emulated_msr = svm_has_emulated_msr,
4707
4708         .vcpu_create = svm_vcpu_create,
4709         .vcpu_free = svm_vcpu_free,
4710         .vcpu_reset = svm_vcpu_reset,
4711
4712         .vm_size = sizeof(struct kvm_svm),
4713         .vm_init = svm_vm_init,
4714         .vm_destroy = svm_vm_destroy,
4715
4716         .prepare_switch_to_guest = svm_prepare_switch_to_guest,
4717         .vcpu_load = svm_vcpu_load,
4718         .vcpu_put = svm_vcpu_put,
4719         .vcpu_blocking = avic_vcpu_blocking,
4720         .vcpu_unblocking = avic_vcpu_unblocking,
4721
4722         .update_exception_bitmap = svm_update_exception_bitmap,
4723         .get_msr_feature = svm_get_msr_feature,
4724         .get_msr = svm_get_msr,
4725         .set_msr = svm_set_msr,
4726         .get_segment_base = svm_get_segment_base,
4727         .get_segment = svm_get_segment,
4728         .set_segment = svm_set_segment,
4729         .get_cpl = svm_get_cpl,
4730         .get_cs_db_l_bits = svm_get_cs_db_l_bits,
4731         .set_cr0 = svm_set_cr0,
4732         .post_set_cr3 = sev_post_set_cr3,
4733         .is_valid_cr4 = svm_is_valid_cr4,
4734         .set_cr4 = svm_set_cr4,
4735         .set_efer = svm_set_efer,
4736         .get_idt = svm_get_idt,
4737         .set_idt = svm_set_idt,
4738         .get_gdt = svm_get_gdt,
4739         .set_gdt = svm_set_gdt,
4740         .set_dr7 = svm_set_dr7,
4741         .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
4742         .cache_reg = svm_cache_reg,
4743         .get_rflags = svm_get_rflags,
4744         .set_rflags = svm_set_rflags,
4745         .get_if_flag = svm_get_if_flag,
4746
4747         .flush_tlb_all = svm_flush_tlb_current,
4748         .flush_tlb_current = svm_flush_tlb_current,
4749         .flush_tlb_gva = svm_flush_tlb_gva,
4750         .flush_tlb_guest = svm_flush_tlb_current,
4751
4752         .vcpu_pre_run = svm_vcpu_pre_run,
4753         .vcpu_run = svm_vcpu_run,
4754         .handle_exit = svm_handle_exit,
4755         .skip_emulated_instruction = svm_skip_emulated_instruction,
4756         .update_emulated_instruction = NULL,
4757         .set_interrupt_shadow = svm_set_interrupt_shadow,
4758         .get_interrupt_shadow = svm_get_interrupt_shadow,
4759         .patch_hypercall = svm_patch_hypercall,
4760         .inject_irq = svm_inject_irq,
4761         .inject_nmi = svm_inject_nmi,
4762         .queue_exception = svm_queue_exception,
4763         .cancel_injection = svm_cancel_injection,
4764         .interrupt_allowed = svm_interrupt_allowed,
4765         .nmi_allowed = svm_nmi_allowed,
4766         .get_nmi_mask = svm_get_nmi_mask,
4767         .set_nmi_mask = svm_set_nmi_mask,
4768         .enable_nmi_window = svm_enable_nmi_window,
4769         .enable_irq_window = svm_enable_irq_window,
4770         .update_cr8_intercept = svm_update_cr8_intercept,
4771         .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
4772         .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons,
4773         .apicv_post_state_restore = avic_apicv_post_state_restore,
4774
4775         .get_mt_mask = svm_get_mt_mask,
4776         .get_exit_info = svm_get_exit_info,
4777
4778         .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,
4779
4780         .has_wbinvd_exit = svm_has_wbinvd_exit,
4781
4782         .get_l2_tsc_offset = svm_get_l2_tsc_offset,
4783         .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
4784         .write_tsc_offset = svm_write_tsc_offset,
4785         .write_tsc_multiplier = svm_write_tsc_multiplier,
4786
4787         .load_mmu_pgd = svm_load_mmu_pgd,
4788
4789         .check_intercept = svm_check_intercept,
4790         .handle_exit_irqoff = svm_handle_exit_irqoff,
4791
4792         .request_immediate_exit = __kvm_request_immediate_exit,
4793
4794         .sched_in = svm_sched_in,
4795
4796         .nested_ops = &svm_nested_ops,
4797
4798         .deliver_interrupt = svm_deliver_interrupt,
4799         .pi_update_irte = avic_pi_update_irte,
4800         .setup_mce = svm_setup_mce,
4801
4802         .smi_allowed = svm_smi_allowed,
4803         .enter_smm = svm_enter_smm,
4804         .leave_smm = svm_leave_smm,
4805         .enable_smi_window = svm_enable_smi_window,
4806
4807         .mem_enc_ioctl = sev_mem_enc_ioctl,
4808         .mem_enc_register_region = sev_mem_enc_register_region,
4809         .mem_enc_unregister_region = sev_mem_enc_unregister_region,
4810         .guest_memory_reclaimed = sev_guest_memory_reclaimed,
4811
4812         .vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
4813         .vm_move_enc_context_from = sev_vm_move_enc_context_from,
4814
4815         .can_emulate_instruction = svm_can_emulate_instruction,
4816
4817         .apic_init_signal_blocked = svm_apic_init_signal_blocked,
4818
4819         .msr_filter_changed = svm_msr_filter_changed,
4820         .complete_emulated_msr = svm_complete_emulated_msr,
4821
4822         .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
4823         .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
4824 };
4825
4826 /*
4827  * The default MMIO mask is a single bit (excluding the present bit),
4828  * which could conflict with the memory encryption bit. Check for
4829  * memory encryption support and override the default MMIO mask if
4830  * memory encryption is enabled.
4831  */
4832 static __init void svm_adjust_mmio_mask(void)
4833 {
4834         unsigned int enc_bit, mask_bit;
4835         u64 msr, mask;
4836
4837         /* If there is no memory encryption support, use existing mask */
4838         if (cpuid_eax(0x80000000) < 0x8000001f)
4839                 return;
4840
4841         /* If memory encryption is not enabled, use existing mask */
4842         rdmsrl(MSR_AMD64_SYSCFG, msr);
4843         if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
4844                 return;
4845
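             /* CPUID Fn8000_001F[EBX] bits 5:0 report the position of the C-bit. */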
4846         enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
4847         mask_bit = boot_cpu_data.x86_phys_bits;
4848
4849         /* Increment the mask bit if it is the same as the encryption bit */
4850         if (enc_bit == mask_bit)
4851                 mask_bit++;
4852
4853         /*
4854          * If the mask bit location is below 52, then some bits above the
4855          * physical addressing limit will always be reserved, so use the
4856          * rsvd_bits() function to generate the mask. This mask, along with
4857          * the present bit, will be used to generate a page fault with
4858          * PFER.RSV = 1.
4859          * PFERR.RSV = 1.
4860          * If the mask bit location is 52 (or above), then clear the mask.
4861          */
4862         mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
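             /*
              * Illustrative example (hypothetical values): with enc_bit == 47 and
              * x86_phys_bits == 47, mask_bit becomes 48 and the MMIO mask is
              * rsvd_bits(48, 51) | PT_PRESENT_MASK, i.e. reserved physical
              * address bits plus the present bit.
              */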
4863
4864         kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
4865 }
4866
4867 static __init void svm_set_cpu_caps(void)
4868 {
4869         kvm_set_cpu_caps();
4870
4871         kvm_caps.supported_xss = 0;
4872
4873         /* CPUID 0x80000001 and 0x8000000A (SVM features) */
4874         if (nested) {
4875                 kvm_cpu_cap_set(X86_FEATURE_SVM);
4876                 kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN);
4877
4878                 if (nrips)
4879                         kvm_cpu_cap_set(X86_FEATURE_NRIPS);
4880
4881                 if (npt_enabled)
4882                         kvm_cpu_cap_set(X86_FEATURE_NPT);
4883
4884                 if (tsc_scaling)
4885                         kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
4886
4887                 if (vls)
4888                         kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);
4889                 if (lbrv)
4890                         kvm_cpu_cap_set(X86_FEATURE_LBRV);
4891
4892                 if (boot_cpu_has(X86_FEATURE_PAUSEFILTER))
4893                         kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);
4894
4895                 if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
4896                         kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);
4897
4898                 if (vgif)
4899                         kvm_cpu_cap_set(X86_FEATURE_VGIF);
4900
4901                 /* Nested VM can receive #VMEXIT instead of triggering #GP */
4902                 kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
4903         }
4904
4905         /* CPUID 0x80000008 */
4906         if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
4907             boot_cpu_has(X86_FEATURE_AMD_SSBD))
4908                 kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
4909
4910         /* AMD PMU PERFCTR_CORE CPUID */
4911         if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
4912                 kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE);
4913
4914         /* CPUID 0x8000001F (SME/SEV features) */
4915         sev_set_cpu_caps();
4916 }
4917
4918 static __init int svm_hardware_setup(void)
4919 {
4920         int cpu;
4921         struct page *iopm_pages;
4922         void *iopm_va;
4923         int r;
4924         unsigned int order = get_order(IOPM_SIZE);
4925
4926         /*
4927          * NX is required for shadow paging and for NPT if the NX huge pages
4928          * mitigation is enabled.
4929          */
4930         if (!boot_cpu_has(X86_FEATURE_NX)) {
4931                 pr_err_ratelimited("NX (Execute Disable) not supported\n");
4932                 return -EOPNOTSUPP;
4933         }
4934         kvm_enable_efer_bits(EFER_NX);
4935
4936         iopm_pages = alloc_pages(GFP_KERNEL, order);
4937
4938         if (!iopm_pages)
4939                 return -ENOMEM;
4940
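             /*
              * Setting every bit in the IOPM causes all I/O port accesses to be
              * intercepted by default.
              */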
4941         iopm_va = page_address(iopm_pages);
4942         memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
4943         iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
4944
4945         init_msrpm_offsets();
4946
4947         kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
4948                                      XFEATURE_MASK_BNDCSR);
4949
4950         if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
4951                 kvm_enable_efer_bits(EFER_FFXSR);
4952
4953         if (tsc_scaling) {
4954                 if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
4955                         tsc_scaling = false;
4956                 } else {
4957                         pr_info("TSC scaling supported\n");
4958                         kvm_caps.has_tsc_control = true;
4959                 }
4960         }
4961         kvm_caps.max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX;
4962         kvm_caps.tsc_scaling_ratio_frac_bits = 32;
4963
4964         tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
4965
4966         /* Check for pause filtering support */
4967         if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
4968                 pause_filter_count = 0;
4969                 pause_filter_thresh = 0;
4970         } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
4971                 pause_filter_thresh = 0;
4972         }
4973
4974         if (nested) {
4975                 printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
4976                 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
4977         }
4978
4979         /*
4980          * KVM's MMU doesn't support using 2-level paging for itself, and thus
4981          * NPT isn't supported if the host is using 2-level paging since host
4982          * CR4 is unchanged on VMRUN.
4983          */
4984         if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
4985                 npt_enabled = false;
4986
4987         if (!boot_cpu_has(X86_FEATURE_NPT))
4988                 npt_enabled = false;
4989
4990         /* Force VM NPT level equal to the host's paging level */
4991         kvm_configure_mmu(npt_enabled, get_npt_level(),
4992                           get_npt_level(), PG_LEVEL_1G);
4993         pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
4994
4995         /* Set up shadow_me_value and shadow_me_mask */
4996         kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask);
4997
4998         /* Note, SEV setup consumes npt_enabled. */
4999         sev_hardware_setup();
5000
5001         svm_hv_hardware_setup();
5002
5003         svm_adjust_mmio_mask();
5004
5005         for_each_possible_cpu(cpu) {
5006                 r = svm_cpu_init(cpu);
5007                 if (r)
5008                         goto err;
5009         }
5010
5011         if (nrips) {
5012                 if (!boot_cpu_has(X86_FEATURE_NRIPS))
5013                         nrips = false;
5014         }
5015
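             /*
              * AVIC, and therefore APICv on SVM, is enabled only if the module
              * parameter is set and avic_hardware_setup() reports hardware support.
              */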
5016         enable_apicv = avic = avic && avic_hardware_setup(&svm_x86_ops);
5017
5018         if (!enable_apicv) {
5019                 svm_x86_ops.vcpu_blocking = NULL;
5020                 svm_x86_ops.vcpu_unblocking = NULL;
5021                 svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
5022         }
5023
5024         if (vls) {
5025                 if (!npt_enabled ||
5026                     !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
5027                     !IS_ENABLED(CONFIG_X86_64)) {
5028                         vls = false;
5029                 } else {
5030                         pr_info("Virtual VMLOAD VMSAVE supported\n");
5031                 }
5032         }
5033
5034         if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
5035                 svm_gp_erratum_intercept = false;
5036
5037         if (vgif) {
5038                 if (!boot_cpu_has(X86_FEATURE_VGIF))
5039                         vgif = false;
5040                 else
5041                         pr_info("Virtual GIF supported\n");
5042         }
5043
5044         if (lbrv) {
5045                 if (!boot_cpu_has(X86_FEATURE_LBRV))
5046                         lbrv = false;
5047                 else
5048                         pr_info("LBR virtualization supported\n");
5049         }
5050
5051         if (!enable_pmu)
5052                 pr_info("PMU virtualization is disabled\n");
5053
5054         svm_set_cpu_caps();
5055
5056         /*
5057          * It seems that on AMD processors the PTE's accessed bit is
5058          * being set by the CPU hardware before the NPF vmexit.
5059          * This is not the expected behaviour and our tests fail because
5060          * of it.
5061          * A workaround is to disable support for
5062          * GUEST_MAXPHYADDR < HOST_MAXPHYADDR when NPT is enabled.
5063          * In this case userspace can check for the support using the
5064          * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
5065          * the lack of it.
5066          * If future AMD CPU models change the behaviour described above,
5067          * this variable can be changed accordingly.
5068          */
5069         allow_smaller_maxphyaddr = !npt_enabled;
5070
5071         return 0;
5072
5073 err:
5074         svm_hardware_unsetup();
5075         return r;
5076 }
5077
5078
5079 static struct kvm_x86_init_ops svm_init_ops __initdata = {
5080         .cpu_has_kvm_support = has_svm,
5081         .disabled_by_bios = is_disabled,
5082         .hardware_setup = svm_hardware_setup,
5083         .check_processor_compatibility = svm_check_processor_compat,
5084
5085         .runtime_ops = &svm_x86_ops,
5086         .pmu_ops = &amd_pmu_ops,
5087 };
5088
5089 static int __init svm_init(void)
5090 {
5091         __unused_size_checks();
5092
5093         return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm),
5094                         __alignof__(struct vcpu_svm), THIS_MODULE);
5095 }
5096
5097 static void __exit svm_exit(void)
5098 {
5099         kvm_exit();
5100 }
5101
5102 module_init(svm_init)
5103 module_exit(svm_exit)