KVM: x86: Allow CPU to force vendor-specific TDP level
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 8834822..bcffae2 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -46,8 +46,6 @@
 #include "kvm_onhyperv.h"
 #include "svm_onhyperv.h"
 
-#define __ex(x) __kvm_handle_fault_on_reboot(x)
-
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
@@ -198,6 +196,11 @@ module_param(avic, bool, 0444);
 bool __read_mostly dump_invalid_vmcb;
 module_param(dump_invalid_vmcb, bool, 0644);
 
+
+bool intercept_smi = true;
+module_param(intercept_smi, bool, 0444);
+
+
 static bool svm_gp_erratum_intercept = true;
 
 static u8 rsm_ins_bytes[] = "\x0f\xaa";
@@ -1010,7 +1013,9 @@ static __init int svm_hardware_setup(void)
        if (!boot_cpu_has(X86_FEATURE_NPT))
                npt_enabled = false;
 
-       kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G);
+       /* Force VM NPT level equal to the host's max NPT level */
+       kvm_configure_mmu(npt_enabled, get_max_npt_level(),
+                         get_max_npt_level(), PG_LEVEL_1G);
        pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
 
        /* Note, SEV setup consumes npt_enabled. */
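
The call above implies that kvm_configure_mmu() has grown an extra argument that lets the vendor module force a fixed TDP root level; passing get_max_npt_level() for both the forced and the maximum level pins the guest's NPT root level to whatever the host paging mode supports. A plausible sketch of the updated prototype is below; the parameter names are assumptions for illustration, not copied from the MMU headers.

void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
		       int tdp_max_root_level, int tdp_huge_page_level);
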
@@ -1156,8 +1161,6 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
        struct vmcb_control_area *control = &svm->vmcb->control;
        struct vmcb_save_area *save = &svm->vmcb->save;
 
-       vcpu->arch.hflags = 0;
-
        svm_set_intercept(svm, INTERCEPT_CR0_READ);
        svm_set_intercept(svm, INTERCEPT_CR3_READ);
        svm_set_intercept(svm, INTERCEPT_CR4_READ);
@@ -1185,7 +1188,10 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
 
        svm_set_intercept(svm, INTERCEPT_INTR);
        svm_set_intercept(svm, INTERCEPT_NMI);
-       svm_set_intercept(svm, INTERCEPT_SMI);
+
+       if (intercept_smi)
+               svm_set_intercept(svm, INTERCEPT_SMI);
+
        svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
        svm_set_intercept(svm, INTERCEPT_RDPMC);
        svm_set_intercept(svm, INTERCEPT_CPUID);
@@ -1233,29 +1239,14 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
                SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
        save->cs.limit = 0xffff;
 
+       save->gdtr.base = 0;
        save->gdtr.limit = 0xffff;
+       save->idtr.base = 0;
        save->idtr.limit = 0xffff;
 
        init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
        init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
 
-       svm_set_cr4(vcpu, 0);
-       svm_set_efer(vcpu, 0);
-       save->dr6 = 0xffff0ff0;
-       kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
-       save->rip = 0x0000fff0;
-       vcpu->arch.regs[VCPU_REGS_RIP] = save->rip;
-
-       /*
-        * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
-        * It also updates the guest-visible cr0 value.
-        */
-       svm_set_cr0(vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
-       kvm_mmu_reset_context(vcpu);
-
-       save->cr4 = X86_CR4_PAE;
-       /* rdx = ?? */
-
        if (npt_enabled) {
                /* Setup VMCB for Nested Paging */
                control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
@@ -1265,14 +1256,12 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
                svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
                save->g_pat = vcpu->arch.pat;
                save->cr3 = 0;
-               save->cr4 = 0;
        }
        svm->current_vmcb->asid_generation = 0;
        svm->asid = 0;
 
        svm->nested.vmcb12_gpa = INVALID_GPA;
        svm->nested.last_vmcb12_gpa = INVALID_GPA;
-       vcpu->arch.hflags = 0;
 
        if (!kvm_pause_in_guest(vcpu->kvm)) {
                control->pause_filter_count = pause_filter_count;
@@ -1322,25 +1311,11 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
 static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       u32 dummy;
-       u32 eax = 1;
 
        svm->spec_ctrl = 0;
        svm->virt_spec_ctrl = 0;
 
-       if (!init_event) {
-               vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE |
-                                      MSR_IA32_APICBASE_ENABLE;
-               if (kvm_vcpu_is_reset_bsp(vcpu))
-                       vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
-       }
        init_vmcb(vcpu);
-
-       kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false);
-       kvm_rdx_write(vcpu, eax);
-
-       if (kvm_vcpu_apicv_active(vcpu) && !init_event)
-               avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
 }
 
 void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
@@ -1398,8 +1373,6 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
                goto error_free_vmsa_page;
        }
 
-       svm_vcpu_init_msrpm(vcpu, svm->msrpm);
-
        svm->vmcb01.ptr = page_address(vmcb01_page);
        svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
 
@@ -1411,6 +1384,8 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
        svm_switch_vmcb(svm, &svm->vmcb01);
        init_vmcb(vcpu);
 
+       svm_vcpu_init_msrpm(vcpu, svm->msrpm);
+
        svm_init_osvw(vcpu);
        vcpu->arch.microcode_version = 0x01000065;
 
@@ -1505,12 +1480,15 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                sd->current_vmcb = svm->vmcb;
                indirect_branch_prediction_barrier();
        }
-       avic_vcpu_load(vcpu, cpu);
+       if (kvm_vcpu_apicv_active(vcpu))
+               avic_vcpu_load(vcpu, cpu);
 }
 
 static void svm_vcpu_put(struct kvm_vcpu *vcpu)
 {
-       avic_vcpu_put(vcpu);
+       if (kvm_vcpu_apicv_active(vcpu))
+               avic_vcpu_put(vcpu);
+
        svm_prepare_host_switch(vcpu);
 
        ++vcpu->stat.host_state_reload;
@@ -1552,7 +1530,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
                load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
                break;
        default:
-               WARN_ON_ONCE(1);
+               KVM_BUG_ON(1, vcpu->kvm);
        }
 }
 
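
KVM_BUG_ON() escalates what used to be a bare WARN_ON_ONCE(): hitting an unknown register here now warns once and marks the whole VM as bugged, so vCPUs are kicked out of the guest and further ioctls fail instead of KVM limping along with a stale cached register. A paraphrased sketch of the generic helper (not the exact definition from kvm_host.h):

#define KVM_BUG_ON(cond, kvm)						\
({									\
	bool __ret = !!(cond);						\
									\
	if (WARN_ON_ONCE(__ret && !(kvm)->vm_bugged))			\
		kvm_vm_bugged(kvm);	/* kill the VM, not the host */	\
	unlikely(__ret);						\
})
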
@@ -1560,8 +1538,11 @@ static void svm_set_vintr(struct vcpu_svm *svm)
 {
        struct vmcb_control_area *control;
 
-       /* The following fields are ignored when AVIC is enabled */
-       WARN_ON(kvm_vcpu_apicv_active(&svm->vcpu));
+       /*
+        * The following fields are ignored when AVIC is enabled
+        */
+       WARN_ON(kvm_apicv_activated(svm->vcpu.kvm));
+
        svm_set_intercept(svm, INTERCEPT_VINTR);
 
        /*
@@ -1923,7 +1904,7 @@ static int npf_interception(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
+       u64 fault_address = svm->vmcb->control.exit_info_2;
        u64 error_code = svm->vmcb->control.exit_info_1;
 
        trace_kvm_page_fault(fault_address, error_code);
@@ -2066,11 +2047,15 @@ static int shutdown_interception(struct kvm_vcpu *vcpu)
                return -EINVAL;
 
        /*
-        * VMCB is undefined after a SHUTDOWN intercept
-        * so reinitialize it.
+        * VMCB is undefined after a SHUTDOWN intercept.  INIT the vCPU to put
+        * the VMCB in a known good state.  Unfortunately, KVM doesn't have
+        * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking
+        * userspace.  From a platform view, INIT is acceptable behavior as
+        * there exist bare metal platforms that automatically INIT the CPU
+        * in response to shutdown.
         */
        clear_page(svm->vmcb);
-       init_vmcb(vcpu);
+       kvm_vcpu_reset(vcpu, true);
 
        kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
        return 0;
@@ -2106,6 +2091,11 @@ static int nmi_interception(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+static int smi_interception(struct kvm_vcpu *vcpu)
+{
+       return 1;
+}
+
 static int intr_interception(struct kvm_vcpu *vcpu)
 {
        ++vcpu->stat.irq_exits;
@@ -2134,11 +2124,12 @@ static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload)
        ret = kvm_skip_emulated_instruction(vcpu);
 
        if (vmload) {
-               nested_svm_vmloadsave(vmcb12, svm->vmcb);
+               svm_copy_vmloadsave_state(svm->vmcb, vmcb12);
                svm->sysenter_eip_hi = 0;
                svm->sysenter_esp_hi = 0;
-       } else
-               nested_svm_vmloadsave(svm->vmcb, vmcb12);
+       } else {
+               svm_copy_vmloadsave_state(vmcb12, svm->vmcb);
+       }
 
        kvm_vcpu_unmap(vcpu, &map, true);
 
@@ -2941,7 +2932,16 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
                        svm_disable_lbrv(vcpu);
                break;
        case MSR_VM_HSAVE_PA:
-               svm->nested.hsave_msr = data;
+               /*
+                * Old kernels did not validate the value written to
+                * MSR_VM_HSAVE_PA.  Allow KVM_SET_MSR to set an invalid
+                * value to allow live migrating buggy or malicious guests
+                * originating from those kernels.
+                */
+               if (!msr->host_initiated && !page_address_valid(vcpu, data))
+                       return 1;
+
+               svm->nested.hsave_msr = data & PAGE_MASK;
                break;
        case MSR_VM_CR:
                return svm_set_vm_cr(vcpu, data);
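
page_address_valid() is an existing helper in KVM's x86.h; paraphrased (a sketch, not a verbatim copy), the validation now applied to guest writes of MSR_VM_HSAVE_PA amounts to:

static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
{
	/* Must be page aligned and within the guest's physical address width. */
	return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
}

Host-initiated writes bypass the check so that migration of guests coming from older, non-validating kernels keeps working; either way the stored value is masked down to a page boundary.
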
@@ -2966,10 +2966,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
                svm->msr_decfg = data;
                break;
        }
-       case MSR_IA32_APICBASE:
-               if (kvm_vcpu_apicv_active(vcpu))
-                       avic_update_vapic_bar(to_svm(vcpu), data);
-               fallthrough;
        default:
                return kvm_set_msr_common(vcpu, msr);
        }
@@ -2994,7 +2990,7 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu)
         * In this case AVIC was temporarily disabled for
         * requesting the IRQ window and we have to re-enable it.
         */
-       svm_toggle_avic_for_irq_window(vcpu, true);
+       kvm_request_apicv_update(vcpu->kvm, true, APICV_INHIBIT_REASON_IRQWIN);
 
        ++vcpu->stat.irq_window_exits;
        return 1;
@@ -3080,8 +3076,7 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [SVM_EXIT_EXCP_BASE + GP_VECTOR]        = gp_interception,
        [SVM_EXIT_INTR]                         = intr_interception,
        [SVM_EXIT_NMI]                          = nmi_interception,
-       [SVM_EXIT_SMI]                          = kvm_emulate_as_nop,
-       [SVM_EXIT_INIT]                         = kvm_emulate_as_nop,
+       [SVM_EXIT_SMI]                          = smi_interception,
        [SVM_EXIT_VINTR]                        = interrupt_window_interception,
        [SVM_EXIT_RDPMC]                        = kvm_emulate_rdpmc,
        [SVM_EXIT_CPUID]                        = kvm_emulate_cpuid,
@@ -3243,12 +3238,14 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
               "excp_to:", save->last_excp_to);
 }
 
-static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
+static bool svm_check_exit_valid(struct kvm_vcpu *vcpu, u64 exit_code)
 {
-       if (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
-           svm_exit_handlers[exit_code])
-               return 0;
+       return (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
+               svm_exit_handlers[exit_code]);
+}
 
+static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
+{
        vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code);
        dump_vmcb(vcpu);
        vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
@@ -3256,14 +3253,13 @@ static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
        vcpu->run->internal.ndata = 2;
        vcpu->run->internal.data[0] = exit_code;
        vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
-
-       return -EINVAL;
+       return 0;
 }
 
 int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
 {
-       if (svm_handle_invalid_exit(vcpu, exit_code))
-               return 0;
+       if (!svm_check_exit_valid(vcpu, exit_code))
+               return svm_handle_invalid_exit(vcpu, exit_code);
 
 #ifdef CONFIG_RETPOLINE
        if (exit_code == SVM_EXIT_MSR)
@@ -3547,7 +3543,7 @@ static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
                 * via AVIC. In such case, we need to temporarily disable AVIC,
                 * and fallback to injecting IRQ via V_IRQ.
                 */
-               svm_toggle_avic_for_irq_window(vcpu, false);
+               kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_IRQWIN);
                svm_set_vintr(svm);
        }
 }
@@ -3782,6 +3778,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
 
        pre_svm_run(vcpu);
 
+       WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu));
+
        sync_lapic_to_cr8(vcpu);
 
        if (unlikely(svm->asid != svm->vmcb->control.asid)) {
@@ -4288,6 +4286,7 @@ static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
 static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
+       struct kvm_host_map map_save;
        int ret;
 
        if (is_guest_mode(vcpu)) {
@@ -4303,6 +4302,29 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
                ret = nested_svm_vmexit(svm);
                if (ret)
                        return ret;
+
+               /*
+                * KVM uses VMCB01 to store L1 host state while L2 runs but
+                * VMCB01 is going to be used during SMM and thus the state will
+                * be lost. Temporarily save non-VMLOAD/VMSAVE state to the host save
+                * area pointed to by MSR_VM_HSAVE_PA. APM guarantees that the
+                * format of the area is identical to the guest save area offset
+                * by 0x400 (matches the offset of 'struct vmcb_save_area'
+                * within 'struct vmcb'). Note: HSAVE area may also be used by
+                * L1 hypervisor to save additional host context (e.g. KVM does
+                * that, see svm_prepare_guest_switch()) which must be
+                * preserved.
+                */
+               if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr),
+                                &map_save) == -EINVAL)
+                       return 1;
+
+               BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
+
+               svm_copy_vmrun_state(map_save.hva + 0x400,
+                                    &svm->vmcb01.ptr->save);
+
+               kvm_vcpu_unmap(vcpu, &map_save, true);
        }
        return 0;
 }
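
The 0x400 constant above is simply where the save area sits inside a VMCB. A simplified sketch of the layout (abridged; the real definitions live in arch/x86/include/asm/svm.h):

struct vmcb {
	struct vmcb_control_area control;	/* reserved bytes pad this to 0x400 */
	struct vmcb_save_area save;		/* so offsetof(struct vmcb, save) == 0x400 */
} __packed;

The BUILD_BUG_ON() in the hunk guards exactly that invariant, so the copy lands at the spot in the HSAVE page where the APM says host state is kept.
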
@@ -4310,13 +4332,14 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
 static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       struct kvm_host_map map;
+       struct kvm_host_map map, map_save;
        int ret = 0;
 
        if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) {
                u64 saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0);
                u64 guest = GET_SMSTATE(u64, smstate, 0x7ed8);
                u64 vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0);
+               struct vmcb *vmcb12;
 
                if (guest) {
                        if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
@@ -4332,8 +4355,25 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
                        if (svm_allocate_nested(svm))
                                return 1;
 
-                       ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, map.hva);
+                       vmcb12 = map.hva;
+
+                       nested_load_control_from_vmcb12(svm, &vmcb12->control);
+
+                       ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12);
                        kvm_vcpu_unmap(vcpu, &map, true);
+
+                       /*
+                        * Restore L1 host state from L1 HSAVE area as VMCB01 was
+                        * used during SMM (see svm_enter_smm())
+                        */
+                       if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr),
+                                        &map_save) == -EINVAL)
+                               return 1;
+
+                       svm_copy_vmrun_state(&svm->vmcb01.ptr->save,
+                                            map_save.hva + 0x400);
+
+                       kvm_vcpu_unmap(vcpu, &map_save, true);
                }
        }
 
@@ -4542,7 +4582,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
        .set_virtual_apic_mode = svm_set_virtual_apic_mode,
        .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
        .check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons,
-       .pre_update_apicv_exec_ctrl = svm_pre_update_apicv_exec_ctrl,
        .load_eoi_exitmap = svm_load_eoi_exitmap,
        .hwapic_irr_update = svm_hwapic_irr_update,
        .hwapic_isr_update = svm_hwapic_isr_update,