#define MSR_BITMAP_MODE_X2APIC 1
#define MSR_BITMAP_MODE_X2APIC_APICV 2
-#define MSR_BITMAP_MODE_LM 4
#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
#endif
-#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
+#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
#define KVM_VM_CR0_ALWAYS_ON \
(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \
int cpu;
bool launched;
bool nmi_known_unmasked;
+ bool hv_timer_armed;
/* Support for vnmi-less CPUs */
int soft_vnmi_blocked;
ktime_t entry_time;
/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
u64 vmcs01_debugctl;
+ u64 vmcs01_guest_bndcfgs;
u16 vpid02;
u16 last_vpid;
int ple_window;
bool ple_window_dirty;
+ bool req_immediate_exit;
+
/* Support for PML */
#define PML_ENTITY_NUM 512
struct page *pml_pg;
return is_exception_n(intr_info, PF_VECTOR);
}
-static inline bool is_no_device(u32 intr_info)
-{
- return is_exception_n(intr_info, NM_VECTOR);
-}
-
static inline bool is_invalid_opcode(u32 intr_info)
{
return is_exception_n(intr_info, UD_VECTOR);
return is_exception_n(intr_info, GP_VECTOR);
}
-static inline bool is_external_interrupt(u32 intr_info)
-{
- return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
- == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
-}
-
static inline bool is_machine_check(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
u16 fs_sel, gs_sel;
int i;
+ vmx->req_immediate_exit = false;
+
if (vmx->loaded_cpu_state)
return;
vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
}
- if (is_long_mode(&vmx->vcpu))
- wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#else
savesegment(fs, fs_sel);
savesegment(gs, gs_sel);
vmx->loaded_cpu_state = NULL;
#ifdef CONFIG_X86_64
- if (is_long_mode(&vmx->vcpu))
- rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
kvm_load_ldt(host_state->ldt_sel);
#ifdef CONFIG_X86_64
static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
{
- if (is_long_mode(&vmx->vcpu)) {
- preempt_disable();
- if (vmx->loaded_cpu_state)
- rdmsrl(MSR_KERNEL_GS_BASE,
- vmx->msr_guest_kernel_gs_base);
- preempt_enable();
- }
+ preempt_disable();
+ if (vmx->loaded_cpu_state)
+ rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ preempt_enable();
return vmx->msr_guest_kernel_gs_base;
}
static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
{
- if (is_long_mode(&vmx->vcpu)) {
- preempt_disable();
- if (vmx->loaded_cpu_state)
- wrmsrl(MSR_KERNEL_GS_BASE, data);
- preempt_enable();
- }
+ preempt_disable();
+ if (vmx->loaded_cpu_state)
+ wrmsrl(MSR_KERNEL_GS_BASE, data);
+ preempt_enable();
vmx->msr_guest_kernel_gs_base = data;
}
#endif
}
} else {
if (vmcs12->exception_bitmap & (1u << nr)) {
- if (nr == DB_VECTOR)
+ if (nr == DB_VECTOR) {
*exit_qual = vcpu->arch.dr6;
- else
+ *exit_qual &= ~(DR6_FIXED_1 | DR6_BT);
+ *exit_qual ^= DR6_RTM;
+ } else {
*exit_qual = 0;
+ }
return 1;
}
}
VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
- if (kvm_mpx_supported())
- msrs->exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
-
/* We support free control of debug control saving. */
msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
VM_ENTRY_LOAD_IA32_PAT;
msrs->entry_ctls_high |=
(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
- if (kvm_mpx_supported())
- msrs->entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
/* We support free control of debug control loading. */
msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
msrs->secondary_ctls_high);
msrs->secondary_ctls_low = 0;
msrs->secondary_ctls_high &=
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_DESC |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_WBINVD_EXITING;
+
/*
* We can emulate "VMCS shadowing," even if the hardware
* doesn't support it.
msrs->secondary_ctls_high |=
SECONDARY_EXEC_UNRESTRICTED_GUEST;
+ if (flexpriority_enabled)
+ msrs->secondary_ctls_high |=
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+
/* miscellaneous data */
rdmsr(MSR_IA32_VMX_MISC,
msrs->misc_low,
if (!msr)
return;
- /*
- * MSR_KERNEL_GS_BASE is not intercepted when the guest is in
- * 64-bit mode as a 64-bit kernel may frequently access the
- * MSR. This means we need to manually save/restore the MSR
- * when switching between guest and host state, but only if
- * the guest is in 64-bit mode. Sync our cached value if the
- * guest is transitioning to 32-bit mode and the CPU contains
- * guest state, i.e. the cache is stale.
- */
-#ifdef CONFIG_X86_64
- if (!(efer & EFER_LMA))
- (void)vmx_read_guest_kernel_gs_base(vmx);
-#endif
vcpu->arch.efer = efer;
if (efer & EFER_LMA) {
vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long hw_cr0;
- hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK);
+ hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
if (enable_unrestricted_guest)
hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
else {
* To use VMXON (and later other VMX instructions), a guest
* must first be able to turn on cr4.VMXE (see handle_vmon()).
* So basically the check on whether to allow nested VMX
- * is here.
+ * is here. We operate under the default treatment of SMM,
+ * so VMX cannot be enabled under SMM.
*/
- if (!nested_vmx_allowed(vcpu))
+ if (!nested_vmx_allowed(vcpu) || is_smm(vcpu))
return 1;
}
mode |= MSR_BITMAP_MODE_X2APIC_APICV;
}
- if (is_long_mode(vcpu))
- mode |= MSR_BITMAP_MODE_LM;
-
return mode;
}
if (!changed)
return;
- vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW,
- !(mode & MSR_BITMAP_MODE_LM));
-
if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
nested_mark_vmcs12_pages_dirty(vcpu);
}
+static u8 vmx_get_rvi(void)
+{
+ return vmcs_read16(GUEST_INTR_STATUS) & 0xff;
+}
+
+static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ void *vapic_page;
+ u32 vppr;
+ int rvi;
+
+ if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
+ !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
+ WARN_ON_ONCE(!vmx->nested.virtual_apic_page))
+ return false;
+
+ rvi = vmx_get_rvi();
+
+ vapic_page = kmap(vmx->nested.virtual_apic_page);
+ vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
+ kunmap(vmx->nested.virtual_apic_page);
+
+ return ((rvi & 0xf0) > (vppr & 0xf0));
+}
+
static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
bool nested)
{
kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
}
+ if (!cpu_has_vmx_preemption_timer())
+ kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit;
+
if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) {
u64 vmx_msr;
return kvm_skip_emulated_instruction(vcpu);
}
+static u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
+}
+
static int handle_invvpid(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u64 vpid;
u64 gla;
} operand;
+ u16 vpid02;
if (!(vmx->nested.msrs.secondary_ctls_high &
SECONDARY_EXEC_ENABLE_VPID) ||
return kvm_skip_emulated_instruction(vcpu);
}
+ vpid02 = nested_get_vpid02(vcpu);
switch (type) {
case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
if (!operand.vpid ||
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
return kvm_skip_emulated_instruction(vcpu);
}
- if (cpu_has_vmx_invvpid_individual_addr() &&
- vmx->nested.vpid02) {
+ if (cpu_has_vmx_invvpid_individual_addr()) {
__invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR,
- vmx->nested.vpid02, operand.gla);
+ vpid02, operand.gla);
} else
- __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+ __vmx_flush_tlb(vcpu, vpid02, false);
break;
case VMX_VPID_EXTENT_SINGLE_CONTEXT:
case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
return kvm_skip_emulated_instruction(vcpu);
}
- __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+ __vmx_flush_tlb(vcpu, vpid02, false);
break;
case VMX_VPID_EXTENT_ALL_CONTEXT:
- __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+ __vmx_flush_tlb(vcpu, vpid02, false);
break;
default:
WARN_ON_ONCE(1);
static int handle_preemption_timer(struct kvm_vcpu *vcpu)
{
- kvm_lapic_expired_hv_timer(vcpu);
+ if (!to_vmx(vcpu)->req_immediate_exit)
+ kvm_lapic_expired_hv_timer(vcpu);
return 1;
}
return false;
else if (is_page_fault(intr_info))
return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
- else if (is_no_device(intr_info) &&
- !(vmcs12->guest_cr0 & X86_CR0_TS))
- return false;
else if (is_debug(intr_info) &&
vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
if (!lapic_in_kernel(vcpu))
return;
+ if (!flexpriority_enabled &&
+ !cpu_has_vmx_virtualize_x2apic_mode())
+ return;
+
/* Postpone execution until vmcs01 is the current VMCS. */
if (is_guest_mode(vcpu)) {
to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true;
return;
}
- if (!cpu_need_tpr_shadow(vcpu))
- return;
-
sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
return max_irr;
}
+static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
+{
+ u8 rvi = vmx_get_rvi();
+ u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
+
+ return ((rvi & 0xf0) > (vppr & 0xf0));
+}
+
static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
{
if (!kvm_vcpu_apicv_active(vcpu))
msrs[i].host, false);
}
-static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
+static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val)
+{
+ vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val);
+ if (!vmx->loaded_vmcs->hv_timer_armed)
+ vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
+ PIN_BASED_VMX_PREEMPTION_TIMER);
+ vmx->loaded_vmcs->hv_timer_armed = true;
+}
+
+static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u64 tscl;
u32 delta_tsc;
- if (vmx->hv_deadline_tsc == -1)
+ if (vmx->req_immediate_exit) {
+ vmx_arm_hv_timer(vmx, 0);
return;
+ }
- tscl = rdtsc();
- if (vmx->hv_deadline_tsc > tscl)
- /* sure to be 32 bit only because checked on set_hv_timer */
- delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
- cpu_preemption_timer_multi);
- else
- delta_tsc = 0;
+ if (vmx->hv_deadline_tsc != -1) {
+ tscl = rdtsc();
+ if (vmx->hv_deadline_tsc > tscl)
+ /* set_hv_timer ensures the delta fits in 32-bits */
+ delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
+ cpu_preemption_timer_multi);
+ else
+ delta_tsc = 0;
- vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
+ vmx_arm_hv_timer(vmx, delta_tsc);
+ return;
+ }
+
+ if (vmx->loaded_vmcs->hv_timer_armed)
+ vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+ PIN_BASED_VMX_PREEMPTION_TIMER);
+ vmx->loaded_vmcs->hv_timer_armed = false;
}
static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
atomic_switch_perf_msrs(vmx);
- vmx_arm_hv_timer(vcpu);
+ vmx_update_hv_timer(vcpu);
/*
* If this vCPU has touched SPEC_CTRL, restore the guest's value if
vmx->loaded_vmcs = vmcs;
vmx_vcpu_load(vcpu, cpu);
put_cpu();
+
+ vm_entry_controls_reset_shadow(vmx);
+ vm_exit_controls_reset_shadow(vmx);
+ vmx_segment_cache_clear(vmx);
}
/*
#undef cr4_fixed1_update
}
+static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (kvm_mpx_supported()) {
+ bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX);
+
+ if (mpx_enabled) {
+ vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
+ vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
+ } else {
+ vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS;
+ vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS;
+ }
+ }
+}
+
static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
- if (nested_vmx_allowed(vcpu))
+ if (nested_vmx_allowed(vcpu)) {
nested_vmx_cr_fixed1_bits_update(vcpu);
+ nested_vmx_entry_exit_ctls_update(vcpu);
+ }
}
static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
return get_vmcs12(vcpu)->ept_pointer;
}
-static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
+static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
WARN_ON(mmu_is_nested(vcpu));
- if (!valid_ept_address(vcpu, nested_ept_get_cr3(vcpu)))
- return 1;
kvm_init_shadow_ept_mmu(vcpu,
to_vmx(vcpu)->nested.msrs.ept_caps &
vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
- return 0;
}
static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (vcpu->arch.virtual_tsc_khz == 0)
- return;
-
- /* Make sure short timeouts reliably trigger an immediate vmexit.
- * hrtimer_start does not guarantee this. */
- if (preemption_timeout <= 1) {
+ /*
+ * A timer value of zero is architecturally guaranteed to cause
+ * a VMExit prior to executing any instructions in the guest.
+ */
+ if (preemption_timeout == 0) {
vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
return;
}
+ if (vcpu->arch.virtual_tsc_khz == 0)
+ return;
+
preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
preemption_timeout *= 1000000;
do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
* bits 15:8 should be zero in posted_intr_nv,
* the descriptor address has been already checked
* in nested_get_vmcs12_pages.
+ *
+ * bits 5:0 of posted_intr_desc_addr should be zero.
*/
if (nested_cpu_has_posted_intr(vmcs12) &&
(!nested_cpu_has_vid(vmcs12) ||
!nested_exit_intr_ack_set(vcpu) ||
- vmcs12->posted_intr_nv & 0xff00))
+ (vmcs12->posted_intr_nv & 0xff00) ||
+ (vmcs12->posted_intr_desc_addr & 0x3f) ||
+ (!page_address_valid(vcpu, vmcs12->posted_intr_desc_addr))))
return -EINVAL;
/* tpr shadow is needed by all apicv features. */
static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
- u64 address = vmcs12->pml_address;
- int maxphyaddr = cpuid_maxphyaddr(vcpu);
+ if (!nested_cpu_has_pml(vmcs12))
+ return 0;
- if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML)) {
- if (!nested_cpu_has_ept(vmcs12) ||
- !IS_ALIGNED(address, 4096) ||
- address >> maxphyaddr)
- return -EINVAL;
- }
+ if (!nested_cpu_has_ept(vmcs12) ||
+ !page_address_valid(vcpu, vmcs12->pml_address))
+ return -EINVAL;
return 0;
}
return 0;
}
+/*
+ * Returns if KVM is able to config CPU to tag TLB entries
+ * populated by L2 differently than TLB entries populated
+ * by L1.
+ *
+ * If L1 uses EPT, then TLB entries are tagged with different EPTP.
+ *
+ * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
+ * with different VPID (L1 entries are tagged with vmx->vpid
+ * while L2 entries are tagged with vmx->nested.vpid02).
+ */
+static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ return nested_cpu_has_ept(vmcs12) ||
+ (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
+}
+
static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
set_cr4_guest_host_mask(vmx);
- if (vmx_mpx_supported())
- vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
+ if (kvm_mpx_supported()) {
+ if (vmx->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
+ vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
+ else
+ vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
+ }
if (enable_vpid) {
if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
exec_control = vmcs12->pin_based_vm_exec_control;
- /* Preemption timer setting is only taken from vmcs01. */
- exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+ /* Preemption timer setting is computed directly in vmx_vcpu_run. */
exec_control |= vmcs_config.pin_based_exec_ctrl;
- if (vmx->hv_deadline_tsc == -1)
- exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+ exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+ vmx->loaded_vmcs->hv_timer_armed = false;
/* Posted interrupts setting is only taken from vmcs12. */
if (nested_cpu_has_posted_intr(vmcs12)) {
* we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
* bits are further modified by vmx_set_efer() below.
*/
- vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
+ vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
/* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
* emulated by vmx_set_efer(), below.
* influence global bitmap(for vpid01 and vpid02 allocation)
* even if spawn a lot of nested vCPUs.
*/
- if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) {
+ if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) {
if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
vmx->nested.last_vpid = vmcs12->virtual_processor_id;
- __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true);
+ __vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false);
}
} else {
- vmx_flush_tlb(vcpu, true);
+ /*
+ * If L1 use EPT, then L0 needs to execute INVEPT on
+ * EPTP02 instead of EPTP01. Therefore, delay TLB
+ * flush until vmcs02->eptp is fully updated by
+ * KVM_REQ_LOAD_CR3. Note that this assumes
+ * KVM_REQ_TLB_FLUSH is evaluated after
+ * KVM_REQ_LOAD_CR3 in vcpu_enter_guest().
+ */
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
}
vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
}
- if (nested_cpu_has_ept(vmcs12)) {
- if (nested_ept_init_mmu_context(vcpu)) {
- *entry_failure_code = ENTRY_FAIL_DEFAULT;
- return 1;
- }
- } else if (nested_cpu_has2(vmcs12,
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
+ if (nested_cpu_has_ept(vmcs12))
+ nested_ept_init_mmu_context(vcpu);
+ else if (nested_cpu_has2(vmcs12,
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
vmx_flush_tlb(vcpu, true);
- }
/*
* This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ bool ia32e;
if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+ if (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12))
return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
!nested_cr3_valid(vcpu, vmcs12->host_cr3))
return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
+ /*
+ * If the load IA32_EFER VM-exit control is 1, bits reserved in the
+ * IA32_EFER MSR must be 0 in the field for that register. In addition,
+ * the values of the LMA and LME bits in the field must each be that of
+ * the host address-space size VM-exit control.
+ */
+ if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
+ ia32e = (vmcs12->vm_exit_controls &
+ VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
+ if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
+ ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
+ ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
+ return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
+ }
+
/*
* From the Intel SDM, volume 3:
* Fields relevant to VM-entry event injection must be set properly.
}
}
+ if (nested_cpu_has_ept(vmcs12) &&
+ !valid_ept_address(vcpu, vmcs12->ept_pointer))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
return 0;
}
return 1;
}
- /*
- * If the load IA32_EFER VM-exit control is 1, bits reserved in the
- * IA32_EFER MSR must be 0 in the field for that register. In addition,
- * the values of the LMA and LME bits in the field must each be that of
- * the host address-space size VM-exit control.
- */
- if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
- ia32e = (vmcs12->vm_exit_controls &
- VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
- if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
- ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
- ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
- return 1;
- }
-
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) ||
(vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
bool from_vmentry = !!exit_qual;
u32 dummy_exit_qual;
- u32 vmcs01_cpu_exec_ctrl;
+ bool evaluate_pending_interrupts;
int r = 0;
- vmcs01_cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
+ (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
+ if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
+ evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
enter_guest_mode(vcpu);
if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ if (kvm_mpx_supported() &&
+ !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
+ vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
- vmx_segment_cache_clear(vmx);
if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
vcpu->arch.tsc_offset += vmcs12->tsc_offset;
* to L1 or delivered directly to L2 (e.g. In case L1 don't
* intercept EXTERNAL_INTERRUPT).
*
- * Usually this would be handled by L0 requesting a
- * IRQ/NMI window by setting VMCS accordingly. However,
- * this setting was done on VMCS01 and now VMCS02 is active
- * instead. Thus, we force L0 to perform pending event
- * evaluation by requesting a KVM_REQ_EVENT.
- */
- if (vmcs01_cpu_exec_ctrl &
- (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING)) {
+ * Usually this would be handled by the processor noticing an
+ * IRQ/NMI window request, or checking RVI during evaluation of
+ * pending virtual interrupts. However, this setting was done
+ * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
+ * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
+ */
+ if (unlikely(evaluate_pending_interrupts))
kvm_make_request(KVM_REQ_EVENT, vcpu);
- }
/*
* Note no nested_vmx_succeed or nested_vmx_fail here. At this point
return 0;
}
+static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
+{
+ to_vmx(vcpu)->req_immediate_exit = true;
+}
+
static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
{
ktime_t remaining =
kvm_clear_interrupt_queue(vcpu);
}
-static void load_vmcs12_mmu_host_state(struct kvm_vcpu *vcpu,
- struct vmcs12 *vmcs12)
-{
- u32 entry_failure_code;
-
- nested_ept_uninit_mmu_context(vcpu);
-
- /*
- * Only PDPTE load can fail as the value of cr3 was checked on entry and
- * couldn't have changed.
- */
- if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
- nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
-
- if (!enable_ept)
- vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
-}
-
/*
* A part of what we need to when the nested L2 guest exits and we want to
* run its L1 parent, is to reset L1's guest state to the host state specified
struct vmcs12 *vmcs12)
{
struct kvm_segment seg;
+ u32 entry_failure_code;
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
vcpu->arch.efer = vmcs12->host_ia32_efer;
vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
vmx_set_cr4(vcpu, vmcs12->host_cr4);
- load_vmcs12_mmu_host_state(vcpu, vmcs12);
+ nested_ept_uninit_mmu_context(vcpu);
/*
- * If vmcs01 don't use VPID, CPU flushes TLB on every
+ * Only PDPTE load can fail as the value of cr3 was checked on entry and
+ * couldn't have changed.
+ */
+ if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
+ nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
+
+ if (!enable_ept)
+ vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
+
+ /*
+ * If vmcs01 doesn't use VPID, CPU flushes TLB on every
* VMEntry/VMExit. Thus, no need to flush TLB.
*
- * If vmcs12 uses VPID, TLB entries populated by L2 are
- * tagged with vmx->nested.vpid02 while L1 entries are tagged
- * with vmx->vpid. Thus, no need to flush TLB.
+ * If vmcs12 doesn't use VPID, L1 expects TLB to be
+ * flushed on every VMEntry/VMExit.
*
- * Therefore, flush TLB only in case vmcs01 uses VPID and
- * vmcs12 don't use VPID as in this case L1 & L2 TLB entries
- * are both tagged with vmx->vpid.
+ * Otherwise, we can preserve TLB entries as long as we are
+ * able to tag L1 TLB entries differently than L2 TLB entries.
+ *
+ * If vmcs12 uses EPT, we need to execute this flush on EPTP01
+ * and therefore we request the TLB flush to happen only after VMCS EPTP
+ * has been set by KVM_REQ_LOAD_CR3.
*/
if (enable_vpid &&
- !(nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02)) {
- vmx_flush_tlb(vcpu, true);
+ (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) {
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
}
+static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
+{
+ struct shared_msr_entry *efer_msr;
+ unsigned int i;
+
+ if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
+ return vmcs_read64(GUEST_IA32_EFER);
+
+ if (cpu_has_load_ia32_efer)
+ return host_efer;
+
+ for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
+ if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
+ return vmx->msr_autoload.guest.val[i].value;
+ }
+
+ efer_msr = find_msr_entry(vmx, MSR_EFER);
+ if (efer_msr)
+ return efer_msr->data;
+
+ return host_efer;
+}
+
+static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmx_msr_entry g, h;
+ struct msr_data msr;
+ gpa_t gpa;
+ u32 i, j;
+
+ vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
+
+ if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
+ /*
+ * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
+ * as vmcs01.GUEST_DR7 contains a userspace defined value
+ * and vcpu->arch.dr7 is not squirreled away before the
+ * nested VMENTER (not worth adding a variable in nested_vmx).
+ */
+ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
+ kvm_set_dr(vcpu, 7, DR7_FIXED_1);
+ else
+ WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
+ }
+
+ /*
+ * Note that calling vmx_set_{efer,cr0,cr4} is important as they
+ * handle a variety of side effects to KVM's software model.
+ */
+ vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
+
+ vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
+ vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
+
+ vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
+ vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
+
+ nested_ept_uninit_mmu_context(vcpu);
+ vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+ __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+
+ /*
+ * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
+ * from vmcs01 (if necessary). The PDPTRs are not loaded on
+ * VMFail, like everything else we just need to ensure our
+ * software model is up-to-date.
+ */
+ ept_save_pdptrs(vcpu);
+
+ kvm_mmu_reset_context(vcpu);
+
+ if (cpu_has_vmx_msr_bitmap())
+ vmx_update_msr_bitmap(vcpu);
+
+ /*
+ * This nasty bit of open coding is a compromise between blindly
+ * loading L1's MSRs using the exit load lists (incorrect emulation
+ * of VMFail), leaving the nested VM's MSRs in the software model
+ * (incorrect behavior) and snapshotting the modified MSRs (too
+ * expensive since the lists are unbound by hardware). For each
+ * MSR that was (prematurely) loaded from the nested VMEntry load
+ * list, reload it from the exit load list if it exists and differs
+ * from the guest value. The intent is to stuff host state as
+ * silently as possible, not to fully process the exit load list.
+ */
+ msr.host_initiated = false;
+ for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
+ gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
+ if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
+ pr_debug_ratelimited(
+ "%s read MSR index failed (%u, 0x%08llx)\n",
+ __func__, i, gpa);
+ goto vmabort;
+ }
+
+ for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
+ gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
+ if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
+ pr_debug_ratelimited(
+ "%s read MSR failed (%u, 0x%08llx)\n",
+ __func__, j, gpa);
+ goto vmabort;
+ }
+ if (h.index != g.index)
+ continue;
+ if (h.value == g.value)
+ break;
+
+ if (nested_vmx_load_msr_check(vcpu, &h)) {
+ pr_debug_ratelimited(
+ "%s check failed (%u, 0x%x, 0x%x)\n",
+ __func__, j, h.index, h.reserved);
+ goto vmabort;
+ }
+
+ msr.index = h.index;
+ msr.data = h.value;
+ if (kvm_set_msr(vcpu, &msr)) {
+ pr_debug_ratelimited(
+ "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
+ __func__, j, h.index, h.value);
+ goto vmabort;
+ }
+ }
+ }
+
+ return;
+
+vmabort:
+ nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
+}
+
/*
* Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
* and modify vmcs12 to make it see what it would expect to see there if
}
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
- vm_entry_controls_reset_shadow(vmx);
- vm_exit_controls_reset_shadow(vmx);
- vmx_segment_cache_clear(vmx);
/* Update any VMCS fields that might have changed while L2 ran */
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
- if (vmx->hv_deadline_tsc == -1)
- vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
- PIN_BASED_VMX_PREEMPTION_TIMER);
- else
- vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
- PIN_BASED_VMX_PREEMPTION_TIMER);
+
if (kvm_has_tsc_control)
decache_tsc_multiplier(vmx);
*/
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- load_vmcs12_mmu_host_state(vcpu, vmcs12);
+ /*
+ * Restore L1's host state to KVM's software model. We're here
+ * because a consistency check was caught by hardware, which
+ * means some amount of guest state has been propagated to KVM's
+ * model and needs to be unwound to the host's state.
+ */
+ nested_vmx_restore_host_state(vcpu);
/*
* The emulated instruction was already skipped in
return -ERANGE;
vmx->hv_deadline_tsc = tscl + delta_tsc;
- vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
- PIN_BASED_VMX_PREEMPTION_TIMER);
-
return delta_tsc == 0;
}
static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- vmx->hv_deadline_tsc = -1;
- vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
- PIN_BASED_VMX_PREEMPTION_TIMER);
+ to_vmx(vcpu)->hv_deadline_tsc = -1;
}
#endif
~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
return -EINVAL;
+ /*
+ * SMM temporarily disables VMX, so we cannot be in guest mode,
+ * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags
+ * must be zero.
+ */
+ if (is_smm(vcpu) ? kvm_state->flags : kvm_state->vmx.smm.flags)
+ return -EINVAL;
+
if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
!(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
return -EINVAL;
.apicv_post_state_restore = vmx_apicv_post_state_restore,
.hwapic_irr_update = vmx_hwapic_irr_update,
.hwapic_isr_update = vmx_hwapic_isr_update,
+ .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
.sync_pir_to_irr = vmx_sync_pir_to_irr,
.deliver_posted_interrupt = vmx_deliver_posted_interrupt,
.umip_emulated = vmx_umip_emulated,
.check_nested_events = vmx_check_nested_events,
+ .request_immediate_exit = vmx_request_immediate_exit,
.sched_in = vmx_sched_in,