KVM: nVMX: Reword comments about generating nested CR0/4 read shadows
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 0c62352..b6f4411 100644
@@ -7,7 +7,6 @@
 #include <asm/mmu_context.h>
 
 #include "cpuid.h"
-#include "evmcs.h"
 #include "hyperv.h"
 #include "mmu.h"
 #include "nested.h"
@@ -16,6 +15,7 @@
 #include "trace.h"
 #include "vmx.h"
 #include "x86.h"
+#include "smm.h"
 
 static bool __read_mostly enable_shadow_vmcs = 1;
 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
@@ -225,6 +225,7 @@ static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
 
 static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
 {
+       struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
        if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
@@ -233,6 +234,12 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
        }
 
        vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
+
+       if (hv_vcpu) {
+               hv_vcpu->nested.pa_page_gpa = INVALID_GPA;
+               hv_vcpu->nested.vm_id = 0;
+               hv_vcpu->nested.vp_id = 0;
+       }
 }
 
 static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
@@ -1125,6 +1132,15 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+       /*
+        * KVM_REQ_HV_TLB_FLUSH flushes entries from either L1's VP_ID or
+        * L2's VP_ID upon request from the guest.  Make sure the right FIFO
+        * is checked for pending entries on every L1/L2 transition, as such
+        * requests may be queued asynchronously by other vCPUs.
+        */
+       if (to_hv_vcpu(vcpu) && enable_ept)
+               kvm_make_request(KVM_REQ_HV_TLB_FLUSH, vcpu);
+
        /*
         * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
         * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a
@@ -1557,11 +1573,19 @@ static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields
 {
        struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
        struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
+       struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu);
 
        /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
        vmcs12->tpr_threshold = evmcs->tpr_threshold;
        vmcs12->guest_rip = evmcs->guest_rip;
 
+       if (unlikely(!(hv_clean_fields &
+                      HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) {
+               hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page;
+               hv_vcpu->nested.vm_id = evmcs->hv_vm_id;
+               hv_vcpu->nested.vp_id = evmcs->hv_vp_id;
+       }
+
        if (unlikely(!(hv_clean_fields &
                       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
                vmcs12->guest_rsp = evmcs->guest_rsp;
@@ -1977,7 +2001,8 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
        if (likely(!guest_cpuid_has_evmcs(vcpu)))
                return EVMPTRLD_DISABLED;
 
-       if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) {
+       evmcs_gpa = nested_get_evmptr(vcpu);
+       if (!evmptr_is_valid(evmcs_gpa)) {
                nested_release_evmcs(vcpu);
                return EVMPTRLD_DISABLED;
        }
@@ -2563,12 +2588,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                nested_ept_init_mmu_context(vcpu);
 
        /*
-        * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
-        * bits which we consider mandatory enabled.
-        * The CR0_READ_SHADOW is what L2 should have expected to read given
-        * the specifications by L1; It's not enough to take
-        * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we
-        * have more bits than L1 expected.
+        * Override the CR0/CR4 read shadows after setting the effective guest
+        * CR0/CR4.  The common helpers also set the shadows, but they don't
+        * account for vmcs12's cr0/4_guest_host_mask.
         */
        vmx_set_cr0(vcpu, vmcs12->guest_cr0);
        vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
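
For reference (not part of this patch): the read shadow written above comes from the nested_read_cr0()/nested_read_cr4() helpers in nested.h, which take the L1-owned bits (per vmcs12's guest/host mask) from vmcs12's read shadow and the remaining bits from the guest value; roughly:

	static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
	{
		/*
		 * Bits covered by cr0_guest_host_mask are owned by L1 and are
		 * read from the shadow; all other bits come from guest_cr0.
		 */
		return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
		       (fields->cr0_read_shadow & fields->cr0_guest_host_mask);
	}

	static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
	{
		return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
		       (fields->cr4_read_shadow & fields->cr4_guest_host_mask);
	}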
@@ -3251,6 +3273,12 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 
 static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
 {
+       /*
+        * Note: when eVMCS is in use, nested_get_evmcs_page() also updates the
+        * 'vp_assist_page' copy in 'struct kvm_vcpu_hv'; this is mandatory for
+        * nested_evmcs_l2_tlb_flush_enabled() to work correctly after
+        * migration.
+        */
        if (!nested_get_evmcs_page(vcpu)) {
                pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
                                     __func__);
@@ -4767,6 +4795,17 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
 
        vmx_switch_vmcs(vcpu, &vmx->vmcs01);
 
+       /*
+        * If IBRS is advertised to the vCPU, KVM must flush the indirect
+        * branch predictors when transitioning from L2 to L1, as L1 expects
+        * hardware (KVM in this case) to provide separate predictor modes.
+        * Bare metal isolates VMX root (host) from VMX non-root (guest), but
+        * doesn't isolate different VMCSs, i.e. in this case, doesn't provide
+        * separate modes for L2 vs L1.
+        */
+       if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+               indirect_branch_prediction_barrier();
+
        /* Update any VMCS fields that might have changed while L2 ran */
        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
        vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
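
For reference (not part of this patch): indirect_branch_prediction_barrier() is the existing IBPB helper from <asm/nospec-branch.h>; a rough sketch of what it expands to is below (the exact feature flag it is gated on may differ between kernel versions):

	static inline void indirect_branch_prediction_barrier(void)
	{
		/*
		 * Issue an IBPB by writing PRED_CMD; patched out via
		 * alternatives when IBPB is not enabled.
		 */
		alternative_msr_write(MSR_IA32_PRED_CMD, PRED_CMD_IBPB,
				      X86_FEATURE_USE_IBPB);
	}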
@@ -4854,6 +4893,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
 
 static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu)
 {
+       kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
        nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
 }
 
@@ -5099,24 +5139,35 @@ static int handle_vmxon(struct kvm_vcpu *vcpu)
                | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
 
        /*
-        * Note, KVM cannot rely on hardware to perform the CR0/CR4 #UD checks
-        * that have higher priority than VM-Exit (see Intel SDM's pseudocode
-        * for VMXON), as KVM must load valid CR0/CR4 values into hardware while
-        * running the guest, i.e. KVM needs to check the _guest_ values.
+        * Manually check CR4.VMXE; KVM must force CR4.VMXE=1 to enter the
+        * guest and so cannot rely on hardware to perform the check, which
+        * has higher priority than VM-Exit (see Intel SDM's pseudocode for
+        * VMXON).
         *
-        * Rely on hardware for the other two pre-VM-Exit checks, !VM86 and
-        * !COMPATIBILITY modes.  KVM may run the guest in VM86 to emulate Real
-        * Mode, but KVM will never take the guest out of those modes.
+        * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86
+        * and !COMPATIBILITY modes.  For an unrestricted guest, KVM doesn't
+        * force any of the relevant guest state.  For a restricted guest, KVM
+        * does force CR0.PE=1, but only to also force VM86 in order to emulate
+        * Real Mode, and so there's no need to check CR0.PE manually.
         */
-       if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) ||
-           !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) {
+       if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
                kvm_queue_exception(vcpu, UD_VECTOR);
                return 1;
        }
 
        /*
-        * CPL=0 and all other checks that are lower priority than VM-Exit must
-        * be checked manually.
+        * The CPL is checked for "not in VMX operation" and for "in VMX root",
+        * and has higher priority than the VM-Fail due to being post-VMXON,
+        * i.e. VMXON #GPs outside of VMX non-root if CPL!=0.  In VMX non-root,
+        * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits
+        * from L2 to L1, i.e. there's no need to check for the vCPU being in
+        * VMX non-root.
+        *
+        * Forwarding the VM-Exit unconditionally, i.e. without performing the
+        * #UD checks (see above), is functionally ok because KVM doesn't allow
+        * L1 to run L2 without CR4.VMXE=1, and because KVM never modifies L2's
+        * CR0 or CR4, i.e. it's L1's responsibility to emulate #UDs that are
+        * missed by hardware due to shadowing CR0 and/or CR4.
         */
        if (vmx_get_cpl(vcpu)) {
                kvm_inject_gp(vcpu, 0);
@@ -5126,6 +5177,17 @@ static int handle_vmxon(struct kvm_vcpu *vcpu)
        if (vmx->nested.vmxon)
                return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
 
+       /*
+        * Invalid CR0/CR4 generates #GP.  These checks are performed if and
+        * only if the vCPU isn't already in VMX operation, i.e. effectively
+        * have lower priority than the VM-Fail above.
+        */
+       if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) ||
+           !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) {
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
+
        if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
                        != VMXON_NEEDED_FEATURES) {
                kvm_inject_gp(vcpu, 0);
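
For reference (not part of this patch): nested_host_cr0_valid()/nested_host_cr4_valid() check the value against the fixed-0/fixed-1 bits reported by the nested VMX capability MSRs; roughly, from nested.h:

	static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1)
	{
		/*
		 * Bits set in fixed0 must be set in val; bits clear in fixed1
		 * must be clear in val.
		 */
		return ((val & fixed1) | fixed0) == val;
	}

	static inline bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
	{
		u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0;
		u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1;

		return fixed_bits_valid(val, fixed0, fixed1);
	}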
@@ -5205,7 +5267,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 zero = 0;
        gpa_t vmptr;
-       u64 evmcs_gpa;
        int r;
 
        if (!nested_vmx_check_permission(vcpu))
@@ -5231,7 +5292,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
         * vmx->nested.hv_evmcs but this shouldn't be a problem.
         */
        if (likely(!guest_cpuid_has_evmcs(vcpu) ||
-                  !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) {
+                  !evmptr_is_valid(nested_get_evmptr(vcpu)))) {
                if (vmptr == vmx->nested.current_vmptr)
                        nested_release_vmcs12(vcpu);
 
@@ -6128,6 +6189,11 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
                 * Handle L2's bus locks in L0 directly.
                 */
                return true;
+       case EXIT_REASON_VMCALL:
+               /* Hyper-V L2 TLB flush hypercall is handled by L0 */
+               return guest_hv_cpuid_has_l2_tlb_flush(vcpu) &&
+                       nested_evmcs_l2_tlb_flush_enabled(vcpu) &&
+                       kvm_hv_is_tlb_flush_hcall(vcpu);
        default:
                break;
        }
@@ -6440,9 +6506,6 @@ out:
        return kvm_state.size;
 }
 
-/*
- * Forcibly leave nested mode in order to be able to reset the VCPU later on.
- */
 void vmx_leave_nested(struct kvm_vcpu *vcpu)
 {
        if (is_guest_mode(vcpu)) {
@@ -6982,4 +7045,5 @@ struct kvm_x86_nested_ops vmx_nested_ops = {
        .write_log_dirty = nested_vmx_write_pml_buffer,
        .enable_evmcs = nested_enable_evmcs,
        .get_evmcs_version = nested_get_evmcs_version,
+       .hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush,
 };