Merge v5.14-rc3 into usb-next
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index e088086..664d20f 100644
@@ -43,6 +43,9 @@
 #include "svm.h"
 #include "svm_ops.h"
 
+#include "kvm_onhyperv.h"
+#include "svm_onhyperv.h"
+
 #define __ex(x) __kvm_handle_fault_on_reboot(x)
 
 MODULE_AUTHOR("Qumranet");
@@ -185,9 +188,21 @@ module_param(vls, int, 0444);
 static int vgif = true;
 module_param(vgif, int, 0444);
 
+/*
+ * Enable / disable AVIC.  Because the default for APICv support differs
+ * between VMX and SVM, we cannot use module_param_named.
+ */
+static bool avic;
+module_param(avic, bool, 0444);
+
 bool __read_mostly dump_invalid_vmcb;
 module_param(dump_invalid_vmcb, bool, 0644);
 
+
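+/* enable/disable interception of #SMI while the guest is running */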
+bool intercept_smi = true;
+module_param(intercept_smi, bool, 0444);
+
+
 static bool svm_gp_erratum_intercept = true;
 
 static u8 rsm_ins_bytes[] = "\x0f\xaa";
@@ -673,6 +688,9 @@ static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
        write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
 
        msrpm[offset] = tmp;
+
+       svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
+
 }
 
 void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
@@ -939,6 +957,16 @@ static __init int svm_hardware_setup(void)
        int r;
        unsigned int order = get_order(IOPM_SIZE);
 
+       /*
+        * NX is required for shadow paging and for NPT if the NX huge pages
+        * mitigation is enabled.
+        */
+       if (!boot_cpu_has(X86_FEATURE_NX)) {
+               pr_err_ratelimited("NX (Execute Disable) not supported\n");
+               return -EOPNOTSUPP;
+       }
+       kvm_enable_efer_bits(EFER_NX);
+
        iopm_pages = alloc_pages(GFP_KERNEL, order);
 
        if (!iopm_pages)
@@ -952,9 +980,6 @@ static __init int svm_hardware_setup(void)
 
        supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
 
-       if (boot_cpu_has(X86_FEATURE_NX))
-               kvm_enable_efer_bits(EFER_NX);
-
        if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
                kvm_enable_efer_bits(EFER_FFXSR);
 
@@ -996,6 +1021,8 @@ static __init int svm_hardware_setup(void)
        /* Note, SEV setup consumes npt_enabled. */
        sev_hardware_setup();
 
+       svm_hv_hardware_setup();
+
        svm_adjust_mmio_mask();
 
        for_each_possible_cpu(cpu) {
@@ -1009,14 +1036,12 @@ static __init int svm_hardware_setup(void)
                        nrips = false;
        }
 
-       if (avic) {
-               if (!npt_enabled || !boot_cpu_has(X86_FEATURE_AVIC)) {
-                       avic = false;
-               } else {
-                       pr_info("AVIC enabled\n");
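+       /*
+        * AVIC requires NPT and hardware AVIC support; the result is also
+        * mirrored into the common enable_apicv flag.
+        */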
+       enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC);
 
-                       amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
-               }
+       if (enable_apicv) {
+               pr_info("AVIC enabled\n");
+
+               amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
        }
 
        if (vls) {
@@ -1080,26 +1105,30 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
        seg->base = 0;
 }
 
-static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       u64 g_tsc_offset = 0;
 
-       if (is_guest_mode(vcpu)) {
-               /* Write L1's TSC offset.  */
-               g_tsc_offset = svm->vmcb->control.tsc_offset -
-                              svm->vmcb01.ptr->control.tsc_offset;
-               svm->vmcb01.ptr->control.tsc_offset = offset;
-       }
+       return svm->nested.ctl.tsc_offset;
+}
 
-       trace_kvm_write_tsc_offset(vcpu->vcpu_id,
-                                  svm->vmcb->control.tsc_offset - g_tsc_offset,
-                                  offset);
+static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
+{
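+       /* Nested TSC scaling is not virtualized; L2 uses the default ratio. */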
+       return kvm_default_tsc_scaling_ratio;
+}
 
-       svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
+static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
 
+       svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
+       svm->vmcb->control.tsc_offset = offset;
        vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
-       return svm->vmcb->control.tsc_offset;
+}
+
+static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
+{
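+       /* The TSC scaling ratio lives in MSR_AMD64_TSC_RATIO, not in the VMCB. */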
+       wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
 }
 
 /* Evaluate instruction intercepts that depend on guest CPUID features. */
@@ -1161,7 +1190,10 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
 
        svm_set_intercept(svm, INTERCEPT_INTR);
        svm_set_intercept(svm, INTERCEPT_NMI);
-       svm_set_intercept(svm, INTERCEPT_SMI);
+
+       if (intercept_smi)
+               svm_set_intercept(svm, INTERCEPT_SMI);
+
        svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
        svm_set_intercept(svm, INTERCEPT_RDPMC);
        svm_set_intercept(svm, INTERCEPT_CPUID);
@@ -1287,6 +1319,8 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
                }
        }
 
+       svm_hv_init_vmcb(svm->vmcb);
+
        vmcb_mark_all_dirty(svm->vmcb);
 
        enable_gif(svm);
@@ -1897,7 +1931,7 @@ static int npf_interception(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
+       u64 fault_address = svm->vmcb->control.exit_info_2;
        u64 error_code = svm->vmcb->control.exit_info_1;
 
        trace_kvm_page_fault(fault_address, error_code);
@@ -2080,6 +2114,11 @@ static int nmi_interception(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+static int smi_interception(struct kvm_vcpu *vcpu)
+{
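+       /*
+        * The intercepted SMI is handled on the host side once GIF is set
+        * again; there is nothing for KVM to emulate, just resume the guest.
+        */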
+       return 1;
+}
+
 static int intr_interception(struct kvm_vcpu *vcpu)
 {
        ++vcpu->stat.irq_exits;
@@ -2915,7 +2954,16 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
                        svm_disable_lbrv(vcpu);
                break;
        case MSR_VM_HSAVE_PA:
-               svm->nested.hsave_msr = data;
+               /*
+                * Old kernels did not validate the value written to
+                * MSR_VM_HSAVE_PA.  Allow KVM_SET_MSR to set an invalid
+                * value so that buggy or malicious guests that originated
+                * on those kernels can still be live migrated.
+                */
+               if (!msr->host_initiated && !page_address_valid(vcpu, data))
+                       return 1;
+
+               svm->nested.hsave_msr = data & PAGE_MASK;
                break;
        case MSR_VM_CR:
                return svm_set_vm_cr(vcpu, data);
@@ -3054,8 +3102,7 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [SVM_EXIT_EXCP_BASE + GP_VECTOR]        = gp_interception,
        [SVM_EXIT_INTR]                         = intr_interception,
        [SVM_EXIT_NMI]                          = nmi_interception,
-       [SVM_EXIT_SMI]                          = kvm_emulate_as_nop,
-       [SVM_EXIT_INIT]                         = kvm_emulate_as_nop,
+       [SVM_EXIT_SMI]                          = smi_interception,
        [SVM_EXIT_VINTR]                        = interrupt_window_interception,
        [SVM_EXIT_RDPMC]                        = kvm_emulate_rdpmc,
        [SVM_EXIT_CPUID]                        = kvm_emulate_cpuid,
@@ -3106,6 +3153,8 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
                return;
        }
 
+       pr_err("VMCB %p, last attempted VMRUN on CPU %d\n",
+              svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
        pr_err("VMCB Control Area:\n");
        pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
        pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
@@ -3762,6 +3811,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
        }
        svm->vmcb->save.cr2 = vcpu->arch.cr2;
 
+       svm_hv_update_vp_id(svm->vmcb, vcpu);
+
        /*
         * Run with all-zero DR6 unless needed, so that we can get the exact cause
         * of a #DB.
@@ -3835,6 +3886,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
        svm->next_rip = 0;
        if (is_guest_mode(vcpu)) {
                nested_sync_control_from_vmcb02(svm);
+
+               /* Track VMRUNs that have made it past consistency checking */
+               if (svm->nested.nested_run_pending &&
+                   svm->vmcb->control.exit_code != SVM_EXIT_ERR)
+                        ++vcpu->stat.nested_run;
+
                svm->nested.nested_run_pending = 0;
        }
 
@@ -3846,10 +3903,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
                vcpu->arch.apf.host_apf_flags =
                        kvm_read_and_reset_apf_flags();
 
-       if (npt_enabled) {
-               vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
-               vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
-       }
+       if (npt_enabled)
+               kvm_register_clear_available(vcpu, VCPU_EXREG_PDPTR);
 
        /*
         * We need to handle MC intercepts here before the vcpu has a chance to
@@ -3877,6 +3932,8 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
                svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
                vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
 
+               hv_track_root_tdp(vcpu, root_hpa);
+
                /* Loading L2's CR3 is handled by enter_svm_guest_mode.  */
                if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
                        return;
@@ -4249,9 +4306,10 @@ static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
        return !svm_smi_blocked(vcpu);
 }
 
-static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
+       struct kvm_host_map map_save;
        int ret;
 
        if (is_guest_mode(vcpu)) {
@@ -4267,20 +4325,44 @@ static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
                ret = nested_svm_vmexit(svm);
                if (ret)
                        return ret;
+
+               /*
+                * KVM uses VMCB01 to store L1 host state while L2 runs, but
+                * VMCB01 is going to be used during SMM and thus that state
+                * would be lost.  Temporarily save the non-VMLOAD/VMSAVE state
+                * to the host save area pointed to by MSR_VM_HSAVE_PA.  The
+                * APM guarantees that the format of that area is identical to
+                * the guest save area offset by 0x400 (which matches the offset
+                * of 'struct vmcb_save_area' within 'struct vmcb').  Note: the
+                * HSAVE area may also be used by the L1 hypervisor to save
+                * additional host context (e.g. KVM does that, see
+                * svm_prepare_guest_switch()), which must be preserved.
+                */
+               if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr),
+                                &map_save) == -EINVAL)
+                       return 1;
+
+               BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
+
+               svm_copy_vmrun_state(&svm->vmcb01.ptr->save,
+                                    map_save.hva + 0x400);
+
+               kvm_vcpu_unmap(vcpu, &map_save, true);
        }
        return 0;
 }
 
-static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
+static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       struct kvm_host_map map;
+       struct kvm_host_map map, map_save;
        int ret = 0;
 
        if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) {
                u64 saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0);
                u64 guest = GET_SMSTATE(u64, smstate, 0x7ed8);
                u64 vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0);
+               struct vmcb *vmcb12;
 
                if (guest) {
                        if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
@@ -4296,8 +4378,25 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
                        if (svm_allocate_nested(svm))
                                return 1;
 
-                       ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, map.hva);
+                       vmcb12 = map.hva;
+
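+                       /* Re-populate svm->nested.ctl from vmcb12 before re-entering L2. */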
+                       nested_load_control_from_vmcb12(svm, &vmcb12->control);
+
+                       ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12);
                        kvm_vcpu_unmap(vcpu, &map, true);
+
+                       /*
+                        * Restore L1 host state from the L1 HSAVE area, as
+                        * VMCB01 was used during SMM (see svm_enter_smm()).
+                        */
+                       if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr),
+                                        &map_save) == -EINVAL)
+                               return 1;
+
+                       svm_copy_vmrun_state(map_save.hva + 0x400,
+                                            &svm->vmcb01.ptr->save);
+
+                       kvm_vcpu_unmap(vcpu, &map_save, true);
                }
        }
 
@@ -4427,13 +4526,12 @@ static int svm_vm_init(struct kvm *kvm)
        if (!pause_filter_count || !pause_filter_thresh)
                kvm->arch.pause_in_guest = true;
 
-       if (avic) {
+       if (enable_apicv) {
                int ret = avic_vm_init(kvm);
                if (ret)
                        return ret;
        }
 
-       kvm_apicv_init(kvm, avic);
        return 0;
 }
 
@@ -4524,7 +4622,10 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
 
        .has_wbinvd_exit = svm_has_wbinvd_exit,
 
-       .write_l1_tsc_offset = svm_write_l1_tsc_offset,
+       .get_l2_tsc_offset = svm_get_l2_tsc_offset,
+       .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
+       .write_tsc_offset = svm_write_tsc_offset,
+       .write_tsc_multiplier = svm_write_tsc_multiplier,
 
        .load_mmu_pgd = svm_load_mmu_pgd,
 
@@ -4544,8 +4645,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
        .setup_mce = svm_setup_mce,
 
        .smi_allowed = svm_smi_allowed,
-       .pre_enter_smm = svm_pre_enter_smm,
-       .pre_leave_smm = svm_pre_leave_smm,
+       .enter_smm = svm_enter_smm,
+       .leave_smm = svm_leave_smm,
        .enable_smi_window = svm_enable_smi_window,
 
        .mem_enc_op = svm_mem_enc_op,