Merge v5.14-rc3 into usb-next
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index e088086..664d20f 100644
@@ -43,6 +43,9 @@
 #include "svm.h"
 #include "svm_ops.h"
 
+#include "kvm_onhyperv.h"
+#include "svm_onhyperv.h"
+
 #define __ex(x) __kvm_handle_fault_on_reboot(x)
 
 MODULE_AUTHOR("Qumranet");
@@ -185,9 +188,21 @@ module_param(vls, int, 0444);
 static int vgif = true;
 module_param(vgif, int, 0444);
 
+/*
+ * Enable / disable AVIC.  Because the default for APICv support differs
+ * between VMX and SVM, we cannot use module_param_named.
+ */
+static bool avic;
+module_param(avic, bool, 0444);
+
 bool __read_mostly dump_invalid_vmcb;
 module_param(dump_invalid_vmcb, bool, 0644);
 
+
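+/* enable/disable interception of #SMI while the guest is running */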
+bool intercept_smi = true;
+module_param(intercept_smi, bool, 0444);
+
+
 static bool svm_gp_erratum_intercept = true;
 
 static u8 rsm_ins_bytes[] = "\x0f\xaa";
@@ -673,6 +688,9 @@ static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
        write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
 
        msrpm[offset] = tmp;
+
+       svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
+
 }
 
 void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
@@ -939,6 +957,16 @@ static __init int svm_hardware_setup(void)
        int r;
        unsigned int order = get_order(IOPM_SIZE);
 
+       /*
+        * NX is required for shadow paging and for NPT if the NX huge pages
+        * mitigation is enabled.
+        */
+       if (!boot_cpu_has(X86_FEATURE_NX)) {
+               pr_err_ratelimited("NX (Execute Disable) not supported\n");
+               return -EOPNOTSUPP;
+       }
+       kvm_enable_efer_bits(EFER_NX);
+
        iopm_pages = alloc_pages(GFP_KERNEL, order);
 
        if (!iopm_pages)
@@ -952,9 +980,6 @@ static __init int svm_hardware_setup(void)
 
        supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
 
-       if (boot_cpu_has(X86_FEATURE_NX))
-               kvm_enable_efer_bits(EFER_NX);
-
        if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
                kvm_enable_efer_bits(EFER_FFXSR);
 
@@ -996,6 +1021,8 @@ static __init int svm_hardware_setup(void)
        /* Note, SEV setup consumes npt_enabled. */
        sev_hardware_setup();
 
+       svm_hv_hardware_setup();
+
        svm_adjust_mmio_mask();
 
        for_each_possible_cpu(cpu) {
@@ -1009,14 +1036,12 @@ static __init int svm_hardware_setup(void)
                        nrips = false;
        }
 
-       if (avic) {
-               if (!npt_enabled || !boot_cpu_has(X86_FEATURE_AVIC)) {
-                       avic = false;
-               } else {
-                       pr_info("AVIC enabled\n");
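+       /*
+        * AVIC requires NPT and hardware AVIC support; the result is also
+        * mirrored into the common enable_apicv flag.
+        */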
+       enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC);
 
-                       amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
-               }
+       if (enable_apicv) {
+               pr_info("AVIC enabled\n");
+
+               amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
        }
 
        if (vls) {
@@ -1080,26 +1105,30 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
        seg->base = 0;
 }
 
-static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       u64 g_tsc_offset = 0;
 
-       if (is_guest_mode(vcpu)) {
-               /* Write L1's TSC offset.  */
-               g_tsc_offset = svm->vmcb->control.tsc_offset -
-                              svm->vmcb01.ptr->control.tsc_offset;
-               svm->vmcb01.ptr->control.tsc_offset = offset;
-       }
+       return svm->nested.ctl.tsc_offset;
+}
 
-       trace_kvm_write_tsc_offset(vcpu->vcpu_id,
-                                  svm->vmcb->control.tsc_offset - g_tsc_offset,
-                                  offset);
+static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
+{
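+       /* Nested TSC scaling is not virtualized; L2 uses the default ratio. */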
+       return kvm_default_tsc_scaling_ratio;
+}
 
-       svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
+static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
 
+       svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
+       svm->vmcb->control.tsc_offset = offset;
        vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
-       return svm->vmcb->control.tsc_offset;
+}
+
+static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
+{
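+       /* The TSC scaling ratio lives in MSR_AMD64_TSC_RATIO, not in the VMCB. */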
+       wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
 }
 
 /* Evaluate instruction intercepts that depend on guest CPUID features. */
@@ -1161,7 +1190,10 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
 
        svm_set_intercept(svm, INTERCEPT_INTR);
        svm_set_intercept(svm, INTERCEPT_NMI);
-       svm_set_intercept(svm, INTERCEPT_SMI);
+
+       if (intercept_smi)
+               svm_set_intercept(svm, INTERCEPT_SMI);
+
        svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
        svm_set_intercept(svm, INTERCEPT_RDPMC);
        svm_set_intercept(svm, INTERCEPT_CPUID);
@@ -1287,6 +1319,8 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
                }
        }
 
+       svm_hv_init_vmcb(svm->vmcb);
+
        vmcb_mark_all_dirty(svm->vmcb);
 
        enable_gif(svm);
@@ -1897,7 +1931,7 @@ static int npf_interception(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
+       u64 fault_address = svm->vmcb->control.exit_info_2;
        u64 error_code = svm->vmcb->control.exit_info_1;
 
        trace_kvm_page_fault(fault_address, error_code);
@@ -2080,6 +2114,11 @@ static int nmi_interception(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+static int smi_interception(struct kvm_vcpu *vcpu)
+{
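+       /*
+        * The intercepted SMI is handled on the host side once GIF is set
+        * again; there is nothing for KVM to emulate, just resume the guest.
+        */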
+       return 1;
+}
+
 static int intr_interception(struct kvm_vcpu *vcpu)
 {
        ++vcpu->stat.irq_exits;
@@ -2915,7 +2954,16 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
                        svm_disable_lbrv(vcpu);
                break;
        case MSR_VM_HSAVE_PA:
-               svm->nested.hsave_msr = data;
+               /*
+                * Old kernels did not validate the value written to
+                * MSR_VM_HSAVE_PA.  Allow KVM_SET_MSR to set an invalid
+                * value so that buggy or malicious guests that originated
+                * on those kernels can still be live migrated.
+                */
+               if (!msr->host_initiated && !page_address_valid(vcpu, data))
+                       return 1;
+
+               svm->nested.hsave_msr = data & PAGE_MASK;
                break;
        case MSR_VM_CR:
                return svm_set_vm_cr(vcpu, data);
@@ -3054,8 +3102,7 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [SVM_EXIT_EXCP_BASE + GP_VECTOR]        = gp_interception,
        [SVM_EXIT_INTR]                         = intr_interception,
        [SVM_EXIT_NMI]                          = nmi_interception,
-       [SVM_EXIT_SMI]                          = kvm_emulate_as_nop,
-       [SVM_EXIT_INIT]                         = kvm_emulate_as_nop,
+       [SVM_EXIT_SMI]                          = smi_interception,
        [SVM_EXIT_VINTR]                        = interrupt_window_interception,
        [SVM_EXIT_RDPMC]                        = kvm_emulate_rdpmc,
        [SVM_EXIT_CPUID]                        = kvm_emulate_cpuid,
@@ -3106,6 +3153,8 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
                return;
        }
 
+       pr_err("VMCB %p, last attempted VMRUN on CPU %d\n",
+              svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
        pr_err("VMCB Control Area:\n");
        pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
        pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
@@ -3762,6 +3811,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
        }
        svm->vmcb->save.cr2 = vcpu->arch.cr2;
 
+       svm_hv_update_vp_id(svm->vmcb, vcpu);
+
        /*
         * Run with all-zero DR6 unless needed, so that we can get the exact cause
         * of a #DB.
@@ -3835,6 +3886,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
        svm->next_rip = 0;
        if (is_guest_mode(vcpu)) {
                nested_sync_control_from_vmcb02(svm);
+
+               /* Track VMRUNs that have made it past consistency checking */
+               if (svm->nested.nested_run_pending &&
+                   svm->vmcb->control.exit_code != SVM_EXIT_ERR)
+                        ++vcpu->stat.nested_run;
+
                svm->nested.nested_run_pending = 0;
        }
 
@@ -3846,10 +3903,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
                vcpu->arch.apf.host_apf_flags =
                        kvm_read_and_reset_apf_flags();
 
-       if (npt_enabled) {
-               vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
-               vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
-       }
+       if (npt_enabled)
+               kvm_register_clear_available(vcpu, VCPU_EXREG_PDPTR);
 
        /*
         * We need to handle MC intercepts here before the vcpu has a chance to
@@ -3877,6 +3932,8 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
                svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
                vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
 
+               hv_track_root_tdp(vcpu, root_hpa);
+
                /* Loading L2's CR3 is handled by enter_svm_guest_mode.  */
                if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
                        return;
@@ -4249,9 +4306,10 @@ static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
        return !svm_smi_blocked(vcpu);
 }
 
-static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
+       struct kvm_host_map map_save;
        int ret;
 
        if (is_guest_mode(vcpu)) {
@@ -4267,20 +4325,44 @@ static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
                ret = nested_svm_vmexit(svm);
                if (ret)
                        return ret;
+
+               /*
+                * KVM uses VMCB01 to store L1 host state while L2 runs, but
+                * VMCB01 is going to be used during SMM and thus that state
+                * would be lost.  Temporarily save the non-VMLOAD/VMSAVE state
+                * to the host save area pointed to by MSR_VM_HSAVE_PA.  The
+                * APM guarantees that the format of that area is identical to
+                * the guest save area offset by 0x400 (which matches the offset
+                * of 'struct vmcb_save_area' within 'struct vmcb').  Note: the
+                * HSAVE area may also be used by the L1 hypervisor to save
+                * additional host context (e.g. KVM does that, see
+                * svm_prepare_guest_switch()), which must be preserved.
+                */
+               if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr),
+                                &map_save) == -EINVAL)
+                       return 1;
+
+               BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
+
+               svm_copy_vmrun_state(&svm->vmcb01.ptr->save,
+                                    map_save.hva + 0x400);
+
+               kvm_vcpu_unmap(vcpu, &map_save, true);
        }
        return 0;
 }
 
-static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
+static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       struct kvm_host_map map;
+       struct kvm_host_map map, map_save;
        int ret = 0;
 
        if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) {
                u64 saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0);
                u64 guest = GET_SMSTATE(u64, smstate, 0x7ed8);
                u64 vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0);
+               struct vmcb *vmcb12;
 
                if (guest) {
                        if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
@@ -4296,8 +4378,25 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
                        if (svm_allocate_nested(svm))
                                return 1;
 
-                       ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, map.hva);
+                       vmcb12 = map.hva;
+
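+                       /* Re-populate svm->nested.ctl from vmcb12 before re-entering L2. */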
+                       nested_load_control_from_vmcb12(svm, &vmcb12->control);
+
+                       ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12);
                        kvm_vcpu_unmap(vcpu, &map, true);
+
+                       /*
+                        * Restore L1 host state from the L1 HSAVE area, as
+                        * VMCB01 was used during SMM (see svm_enter_smm()).
+                        */
+                       if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr),
+                                        &map_save) == -EINVAL)
+                               return 1;
+
+                       svm_copy_vmrun_state(map_save.hva + 0x400,
+                                            &svm->vmcb01.ptr->save);
+
+                       kvm_vcpu_unmap(vcpu, &map_save, true);
                }
        }
 
@@ -4427,13 +4526,12 @@ static int svm_vm_init(struct kvm *kvm)
        if (!pause_filter_count || !pause_filter_thresh)
                kvm->arch.pause_in_guest = true;
 
-       if (avic) {
+       if (enable_apicv) {
                int ret = avic_vm_init(kvm);
                if (ret)
                        return ret;
        }
 
-       kvm_apicv_init(kvm, avic);
        return 0;
 }
 
@@ -4524,7 +4622,10 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
 
        .has_wbinvd_exit = svm_has_wbinvd_exit,
 
-       .write_l1_tsc_offset = svm_write_l1_tsc_offset,
+       .get_l2_tsc_offset = svm_get_l2_tsc_offset,
+       .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
+       .write_tsc_offset = svm_write_tsc_offset,
+       .write_tsc_multiplier = svm_write_tsc_multiplier,
 
        .load_mmu_pgd = svm_load_mmu_pgd,
 
@@ -4544,8 +4645,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
        .setup_mce = svm_setup_mce,
 
        .smi_allowed = svm_smi_allowed,
-       .pre_enter_smm = svm_pre_enter_smm,
-       .pre_leave_smm = svm_pre_leave_smm,
+       .enter_smm = svm_enter_smm,
+       .leave_smm = svm_leave_smm,
        .enable_smi_window = svm_enable_smi_window,
 
        .mem_enc_op = svm_mem_enc_op,