KVM/x86: Fix invvpid and invept register operand size in 64-bit mode
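
In 64-bit mode the register operand of INVVPID and INVEPT is always a
64-bit register, so the extent-type argument of the __invvpid()/__invept()
wrappers is widened from int to unsigned long to match the operand size
the CPU actually uses.

The fragment below is an illustrative sketch only, not code from this
patch: the 128-bit descriptor layout mirrors the one visible in the diff,
but the asm constraints, the absence of the kernel's exception-handling
wrappers, and the invvpid_sketch() name are simplifications added here for
illustration.  INVVPID is a privileged VMX instruction, so this compiles
but can only execute in VMX root operation; INVEPT is analogous, with a
128-bit {eptp, gpa} descriptor.

    #include <stdint.h>

    static inline void invvpid_sketch(unsigned long ext, uint16_t vpid,
                                      uint64_t gva)
    {
            /* 128-bit in-memory descriptor: VPID, reserved bits, address. */
            struct {
                    uint64_t vpid : 16;
                    uint64_t rsvd : 48;
                    uint64_t gva;
            } operand = { vpid, 0, gva };

            /*
             * AT&T operand order: memory descriptor first, then the register
             * selecting the invalidation type.  With "r" and an unsigned
             * long, the type travels in a full 64-bit GPR, matching the
             * register operand size of INVVPID in 64-bit mode.
             */
            asm volatile("invvpid %0, %1"
                         : : "m"(operand), "r"(ext) : "cc", "memory");
    }
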
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index cdc4367..05a0e8f 100644
@@ -110,6 +110,9 @@ module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
 static bool __read_mostly nested = 0;
 module_param(nested, bool, S_IRUGO);
 
+static bool __read_mostly nested_early_check = 0;
+module_param(nested_early_check, bool, S_IRUGO);
+
 static u64 __read_mostly host_xss;
 
 static bool __read_mostly enable_pml = 1;
@@ -187,6 +190,7 @@ static unsigned int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
 module_param(ple_window_max, uint, 0444);
 
 extern const ulong vmx_return;
+extern const ulong vmx_early_consistency_check_return;
 
 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
@@ -2066,7 +2070,7 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
        return -1;
 }
 
-static inline void __invvpid(int ext, u16 vpid, gva_t gva)
+static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva)
 {
     struct {
        u64 vpid : 16;
@@ -2081,7 +2085,7 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva)
     BUG_ON(error);
 }
 
-static inline void __invept(int ext, u64 eptp, gpa_t gpa)
+static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa)
 {
        struct {
                u64 eptp, gpa;
@@ -2696,7 +2700,8 @@ static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
                u64 guest_val, u64 host_val)
 {
        vmcs_write64(guest_val_vmcs, guest_val);
-       vmcs_write64(host_val_vmcs, host_val);
+       if (host_val_vmcs != HOST_IA32_EFER)
+               vmcs_write64(host_val_vmcs, host_val);
        vm_entry_controls_setbit(vmx, entry);
        vm_exit_controls_setbit(vmx, exit);
 }
@@ -5106,9 +5111,10 @@ static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid,
                                bool invalidate_gpa)
 {
        if (enable_ept && (invalidate_gpa || !enable_vpid)) {
-               if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+               if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
                        return;
-               ept_sync_context(construct_eptp(vcpu, vcpu->arch.mmu.root_hpa));
+               ept_sync_context(construct_eptp(vcpu,
+                                               vcpu->arch.mmu->root_hpa));
        } else {
                vpid_sync_context(vpid);
        }
@@ -6333,6 +6339,9 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
                rdmsr(MSR_IA32_CR_PAT, low32, high32);
                vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
        }
+
+       if (cpu_has_load_ia32_efer)
+               vmcs_write64(HOST_IA32_EFER, host_efer);
 }
 
 static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
@@ -8060,35 +8069,37 @@ static int handle_monitor(struct kvm_vcpu *vcpu)
 
 /*
  * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
- * set the success or error code of an emulated VMX instruction, as specified
- * by Vol 2B, VMX Instruction Reference, "Conventions".
+ * set the success or error code of an emulated VMX instruction (as specified
+ * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
+ * instruction.
  */
-static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
+static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
 {
        vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
                        & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
                            X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
+       return kvm_skip_emulated_instruction(vcpu);
 }
 
-static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
+static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
 {
        vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
                        & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
                            X86_EFLAGS_SF | X86_EFLAGS_OF))
                        | X86_EFLAGS_CF);
+       return kvm_skip_emulated_instruction(vcpu);
 }
 
-static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
-                                       u32 vm_instruction_error)
+static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
+                               u32 vm_instruction_error)
 {
-       if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
-               /*
-                * failValid writes the error number to the current VMCS, which
-                * can't be done there isn't a current VMCS.
-                */
-               nested_vmx_failInvalid(vcpu);
-               return;
-       }
+       /*
+        * failValid writes the error number to the current VMCS, which
+        * can't be done if there isn't a current VMCS.
+        */
+       if (to_vmx(vcpu)->nested.current_vmptr == -1ull)
+               return nested_vmx_failInvalid(vcpu);
+
        vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
                        & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
                            X86_EFLAGS_SF | X86_EFLAGS_OF))
@@ -8098,6 +8109,7 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
         * We don't need to force a shadow sync because
         * VM_INSTRUCTION_ERROR is not shadowed
         */
+       return kvm_skip_emulated_instruction(vcpu);
 }
 
 static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
@@ -8339,10 +8351,9 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
                return 1;
        }
 
-       if (vmx->nested.vmxon) {
-               nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (vmx->nested.vmxon)
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
 
        if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
                        != VMXON_NEEDED_FEATURES) {
@@ -8361,21 +8372,17 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
         * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
         * which replaces physical address width with 32
         */
-       if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
-               nested_vmx_failInvalid(vcpu);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
+               return nested_vmx_failInvalid(vcpu);
 
        page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
-       if (is_error_page(page)) {
-               nested_vmx_failInvalid(vcpu);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (is_error_page(page))
+               return nested_vmx_failInvalid(vcpu);
+
        if (*(u32 *)kmap(page) != VMCS12_REVISION) {
                kunmap(page);
                kvm_release_page_clean(page);
-               nested_vmx_failInvalid(vcpu);
-               return kvm_skip_emulated_instruction(vcpu);
+               return nested_vmx_failInvalid(vcpu);
        }
        kunmap(page);
        kvm_release_page_clean(page);
@@ -8385,8 +8392,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
        if (ret)
                return ret;
 
-       nested_vmx_succeed(vcpu);
-       return kvm_skip_emulated_instruction(vcpu);
+       return nested_vmx_succeed(vcpu);
 }
 
 /*
@@ -8417,8 +8423,10 @@ static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
        vmcs_write64(VMCS_LINK_POINTER, -1ull);
 }
 
-static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
+static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
        if (vmx->nested.current_vmptr == -1ull)
                return;
 
@@ -8432,10 +8440,12 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
        vmx->nested.posted_intr_nv = -1;
 
        /* Flush VMCS12 to guest memory */
-       kvm_vcpu_write_guest_page(&vmx->vcpu,
+       kvm_vcpu_write_guest_page(vcpu,
                                  vmx->nested.current_vmptr >> PAGE_SHIFT,
                                  vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
 
+       kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+
        vmx->nested.current_vmptr = -1ull;
 }
 
@@ -8443,8 +8453,10 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
  * Free whatever needs to be freed from vmx->nested when L1 goes down, or
  * just stops using VMX.
  */
-static void free_nested(struct vcpu_vmx *vmx)
+static void free_nested(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
        if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
                return;
 
@@ -8477,6 +8489,8 @@ static void free_nested(struct vcpu_vmx *vmx)
                vmx->nested.pi_desc = NULL;
        }
 
+       kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+
        free_loaded_vmcs(&vmx->nested.vmcs02);
 }
 
@@ -8485,9 +8499,8 @@ static int handle_vmoff(struct kvm_vcpu *vcpu)
 {
        if (!nested_vmx_check_permission(vcpu))
                return 1;
-       free_nested(to_vmx(vcpu));
-       nested_vmx_succeed(vcpu);
-       return kvm_skip_emulated_instruction(vcpu);
+       free_nested(vcpu);
+       return nested_vmx_succeed(vcpu);
 }
 
 /* Emulate the VMCLEAR instruction */
@@ -8503,25 +8516,22 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
        if (nested_vmx_get_vmptr(vcpu, &vmptr))
                return 1;
 
-       if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
-               nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_VMCLEAR_INVALID_ADDRESS);
 
-       if (vmptr == vmx->nested.vmxon_ptr) {
-               nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (vmptr == vmx->nested.vmxon_ptr)
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_VMCLEAR_VMXON_POINTER);
 
        if (vmptr == vmx->nested.current_vmptr)
-               nested_release_vmcs12(vmx);
+               nested_release_vmcs12(vcpu);
 
        kvm_vcpu_write_guest(vcpu,
                        vmptr + offsetof(struct vmcs12, launch_state),
                        &zero, sizeof(zero));
 
-       nested_vmx_succeed(vcpu);
-       return kvm_skip_emulated_instruction(vcpu);
+       return nested_vmx_succeed(vcpu);
 }
 
 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
@@ -8677,20 +8687,6 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
        vmcs_load(vmx->loaded_vmcs->vmcs);
 }
 
-/*
- * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was
- * used before) all generate the same failure when it is missing.
- */
-static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-       if (vmx->nested.current_vmptr == -1ull) {
-               nested_vmx_failInvalid(vcpu);
-               return 0;
-       }
-       return 1;
-}
-
 static int handle_vmread(struct kvm_vcpu *vcpu)
 {
        unsigned long field;
@@ -8703,8 +8699,8 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
        if (!nested_vmx_check_permission(vcpu))
                return 1;
 
-       if (!nested_vmx_check_vmcs12(vcpu))
-               return kvm_skip_emulated_instruction(vcpu);
+       if (to_vmx(vcpu)->nested.current_vmptr == -1ull)
+               return nested_vmx_failInvalid(vcpu);
 
        if (!is_guest_mode(vcpu))
                vmcs12 = get_vmcs12(vcpu);
@@ -8713,20 +8709,18 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
                 * When vmcs->vmcs_link_pointer is -1ull, any VMREAD
                 * to shadowed-field sets the ALU flags for VMfailInvalid.
                 */
-               if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) {
-                       nested_vmx_failInvalid(vcpu);
-                       return kvm_skip_emulated_instruction(vcpu);
-               }
+               if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
+                       return nested_vmx_failInvalid(vcpu);
                vmcs12 = get_shadow_vmcs12(vcpu);
        }
 
        /* Decode instruction info and find the field to read */
        field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
        /* Read the field, zero-extended to a u64 field_value */
-       if (vmcs12_read_any(vmcs12, field, &field_value) < 0) {
-               nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (vmcs12_read_any(vmcs12, field, &field_value) < 0)
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+
        /*
         * Now copy part of this value to register or memory, as requested.
         * Note that the number of bits actually copied is 32 or 64 depending
@@ -8744,8 +8738,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
                                            (is_long_mode(vcpu) ? 8 : 4), NULL);
        }
 
-       nested_vmx_succeed(vcpu);
-       return kvm_skip_emulated_instruction(vcpu);
+       return nested_vmx_succeed(vcpu);
 }
 
 
@@ -8770,8 +8763,8 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
        if (!nested_vmx_check_permission(vcpu))
                return 1;
 
-       if (!nested_vmx_check_vmcs12(vcpu))
-               return kvm_skip_emulated_instruction(vcpu);
+       if (vmx->nested.current_vmptr == -1ull)
+               return nested_vmx_failInvalid(vcpu);
 
        if (vmx_instruction_info & (1u << 10))
                field_value = kvm_register_readl(vcpu,
@@ -8794,11 +8787,9 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
         * VMCS," then the "read-only" fields are actually read/write.
         */
        if (vmcs_field_readonly(field) &&
-           !nested_cpu_has_vmwrite_any_field(vcpu)) {
-               nested_vmx_failValid(vcpu,
+           !nested_cpu_has_vmwrite_any_field(vcpu))
+               return nested_vmx_failValid(vcpu,
                        VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
 
        if (!is_guest_mode(vcpu))
                vmcs12 = get_vmcs12(vcpu);
@@ -8807,18 +8798,14 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
                 * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE
                 * to shadowed-field sets the ALU flags for VMfailInvalid.
                 */
-               if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) {
-                       nested_vmx_failInvalid(vcpu);
-                       return kvm_skip_emulated_instruction(vcpu);
-               }
+               if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
+                       return nested_vmx_failInvalid(vcpu);
                vmcs12 = get_shadow_vmcs12(vcpu);
-
        }
 
-       if (vmcs12_write_any(vmcs12, field, field_value) < 0) {
-               nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (vmcs12_write_any(vmcs12, field, field_value) < 0)
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_UNSUPPORTED_VMCS_COMPONENT);
 
        /*
         * Do not track vmcs12 dirty-state if in guest-mode
@@ -8840,8 +8827,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
                }
        }
 
-       nested_vmx_succeed(vcpu);
-       return kvm_skip_emulated_instruction(vcpu);
+       return nested_vmx_succeed(vcpu);
 }
 
 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
@@ -8869,36 +8855,33 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
        if (nested_vmx_get_vmptr(vcpu, &vmptr))
                return 1;
 
-       if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
-               nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_VMPTRLD_INVALID_ADDRESS);
 
-       if (vmptr == vmx->nested.vmxon_ptr) {
-               nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (vmptr == vmx->nested.vmxon_ptr)
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_VMPTRLD_VMXON_POINTER);
 
        if (vmx->nested.current_vmptr != vmptr) {
                struct vmcs12 *new_vmcs12;
                struct page *page;
                page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
-               if (is_error_page(page)) {
-                       nested_vmx_failInvalid(vcpu);
-                       return kvm_skip_emulated_instruction(vcpu);
-               }
+               if (is_error_page(page))
+                       return nested_vmx_failInvalid(vcpu);
+
                new_vmcs12 = kmap(page);
                if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
                    (new_vmcs12->hdr.shadow_vmcs &&
                     !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
                        kunmap(page);
                        kvm_release_page_clean(page);
-                       nested_vmx_failValid(vcpu,
+                       return nested_vmx_failValid(vcpu,
                                VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
-                       return kvm_skip_emulated_instruction(vcpu);
                }
 
-               nested_release_vmcs12(vmx);
+               nested_release_vmcs12(vcpu);
+
                /*
                 * Load VMCS12 from guest memory since it is not already
                 * cached.
@@ -8910,8 +8893,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
                set_current_vmptr(vmx, vmptr);
        }
 
-       nested_vmx_succeed(vcpu);
-       return kvm_skip_emulated_instruction(vcpu);
+       return nested_vmx_succeed(vcpu);
 }
 
 /* Emulate the VMPTRST instruction */
@@ -8934,8 +8916,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
                kvm_inject_page_fault(vcpu, &e);
                return 1;
        }
-       nested_vmx_succeed(vcpu);
-       return kvm_skip_emulated_instruction(vcpu);
+       return nested_vmx_succeed(vcpu);
 }
 
 /* Emulate the INVEPT instruction */
@@ -8965,11 +8946,9 @@ static int handle_invept(struct kvm_vcpu *vcpu)
 
        types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
 
-       if (type >= 32 || !(types & (1 << type))) {
-               nested_vmx_failValid(vcpu,
+       if (type >= 32 || !(types & (1 << type)))
+               return nested_vmx_failValid(vcpu,
                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
 
        /* According to the Intel VMX instruction reference, the memory
         * operand is read even if it isn't needed (e.g., for type==global)
@@ -8991,14 +8970,13 @@ static int handle_invept(struct kvm_vcpu *vcpu)
        case VMX_EPT_EXTENT_CONTEXT:
                kvm_mmu_sync_roots(vcpu);
                kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
-               nested_vmx_succeed(vcpu);
                break;
        default:
                BUG_ON(1);
                break;
        }
 
-       return kvm_skip_emulated_instruction(vcpu);
+       return nested_vmx_succeed(vcpu);
 }
 
 static u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
@@ -9037,11 +9015,9 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
        types = (vmx->nested.msrs.vpid_caps &
                        VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
 
-       if (type >= 32 || !(types & (1 << type))) {
-               nested_vmx_failValid(vcpu,
+       if (type >= 32 || !(types & (1 << type)))
+               return nested_vmx_failValid(vcpu,
                        VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
 
        /* according to the intel vmx instruction reference, the memory
         * operand is read even if it isn't needed (e.g., for type==global)
@@ -9053,21 +9029,17 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
                kvm_inject_page_fault(vcpu, &e);
                return 1;
        }
-       if (operand.vpid >> 16) {
-               nested_vmx_failValid(vcpu,
+       if (operand.vpid >> 16)
+               return nested_vmx_failValid(vcpu,
                        VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
 
        vpid02 = nested_get_vpid02(vcpu);
        switch (type) {
        case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
                if (!operand.vpid ||
-                   is_noncanonical_address(operand.gla, vcpu)) {
-                       nested_vmx_failValid(vcpu,
+                   is_noncanonical_address(operand.gla, vcpu))
+                       return nested_vmx_failValid(vcpu,
                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
-                       return kvm_skip_emulated_instruction(vcpu);
-               }
                if (cpu_has_vmx_invvpid_individual_addr()) {
                        __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR,
                                vpid02, operand.gla);
@@ -9076,11 +9048,9 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
                break;
        case VMX_VPID_EXTENT_SINGLE_CONTEXT:
        case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
-               if (!operand.vpid) {
-                       nested_vmx_failValid(vcpu,
+               if (!operand.vpid)
+                       return nested_vmx_failValid(vcpu,
                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
-                       return kvm_skip_emulated_instruction(vcpu);
-               }
                __vmx_flush_tlb(vcpu, vpid02, false);
                break;
        case VMX_VPID_EXTENT_ALL_CONTEXT:
@@ -9091,9 +9061,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
                return kvm_skip_emulated_instruction(vcpu);
        }
 
-       nested_vmx_succeed(vcpu);
-
-       return kvm_skip_emulated_instruction(vcpu);
+       return nested_vmx_succeed(vcpu);
 }
 
 static int handle_invpcid(struct kvm_vcpu *vcpu)
@@ -9164,11 +9132,11 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
                }
 
                for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
-                       if (kvm_get_pcid(vcpu, vcpu->arch.mmu.prev_roots[i].cr3)
+                       if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3)
                            == operand.pcid)
                                roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
 
-               kvm_mmu_free_roots(vcpu, roots_to_free);
+               kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
                /*
                 * If neither the current cr3 nor any of the prev_roots use the
                 * given PCID, then nothing needs to be done here because a
@@ -9295,7 +9263,7 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
 
                kvm_mmu_unload(vcpu);
                mmu->ept_ad = accessed_dirty;
-               mmu->base_role.ad_disabled = !accessed_dirty;
+               mmu->mmu_role.base.ad_disabled = !accessed_dirty;
                vmcs12->ept_pointer = address;
                /*
                 * TODO: Check what's the correct approach in case
@@ -10969,12 +10937,10 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
  */
 static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu)
 {
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-       vcpu_load(vcpu);
-       vmx_switch_vmcs(vcpu, &vmx->vmcs01);
-       free_nested(vmx);
-       vcpu_put(vcpu);
+       vcpu_load(vcpu);
+       vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
+       free_nested(vcpu);
+       vcpu_put(vcpu);
 }
 
 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
@@ -11341,21 +11307,24 @@ static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
 {
        WARN_ON(mmu_is_nested(vcpu));
 
+       vcpu->arch.mmu = &vcpu->arch.guest_mmu;
        kvm_init_shadow_ept_mmu(vcpu,
                        to_vmx(vcpu)->nested.msrs.ept_caps &
                        VMX_EPT_EXECUTE_ONLY_BIT,
                        nested_ept_ad_enabled(vcpu),
                        nested_ept_get_cr3(vcpu));
-       vcpu->arch.mmu.set_cr3           = vmx_set_cr3;
-       vcpu->arch.mmu.get_cr3           = nested_ept_get_cr3;
-       vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
+       vcpu->arch.mmu->set_cr3           = vmx_set_cr3;
+       vcpu->arch.mmu->get_cr3           = nested_ept_get_cr3;
+       vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
+       vcpu->arch.mmu->get_pdptr         = kvm_pdptr_read;
 
        vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
 }
 
 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
 {
-       vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+       vcpu->arch.mmu = &vcpu->arch.root_mmu;
+       vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
 }
 
 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
@@ -11999,6 +11968,14 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
                return;
        vmx->nested.vmcs02_initialized = true;
 
+       /*
+        * We don't care what the EPTP value is; we just need to guarantee
+        * it's valid so we don't get a false positive when doing early
+        * consistency checks.
+        */
+       if (enable_ept && nested_early_check)
+               vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0));
+
        /* All VMFUNCs are currently emulated through L0 vmexits.  */
        if (cpu_has_vmx_vmfunc())
                vmcs_write64(VM_FUNCTION_CONTROL, 0);
@@ -12052,7 +12029,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
         * entry, but only if the current (host) sp changed from the value
         * we wrote last (vmx->host_rsp).  This cache is no longer relevant
         * if we switch vmcs, and rather than hold a separate cache per vmcs,
-        * here we just force the write to happen on entry.
+        * here we just force the write to happen on entry.  host_rsp will
+        * also be written unconditionally by nested_vmx_check_vmentry_hw()
+        * if we are doing early consistency checks via hardware.
         */
        vmx->host_rsp = 0;
 
@@ -12680,12 +12659,124 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        return 0;
 }
 
+static int __noclone nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       unsigned long cr3, cr4;
+
+       if (!nested_early_check)
+               return 0;
+
+       if (vmx->msr_autoload.host.nr)
+               vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
+       if (vmx->msr_autoload.guest.nr)
+               vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
+
+       preempt_disable();
+
+       vmx_prepare_switch_to_guest(vcpu);
+
+       /*
+        * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
+        * which is reserved to '1' by hardware.  GUEST_RFLAGS is guaranteed to
+        * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
+        * there is no need to preserve other bits or save/restore the field.
+        */
+       vmcs_writel(GUEST_RFLAGS, 0);
+
+       vmcs_writel(HOST_RIP, vmx_early_consistency_check_return);
+
+       cr3 = __get_current_cr3_fast();
+       if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
+               vmcs_writel(HOST_CR3, cr3);
+               vmx->loaded_vmcs->host_state.cr3 = cr3;
+       }
+
+       cr4 = cr4_read_shadow();
+       if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
+               vmcs_writel(HOST_CR4, cr4);
+               vmx->loaded_vmcs->host_state.cr4 = cr4;
+       }
+
+       vmx->__launched = vmx->loaded_vmcs->launched;
+
+       asm(
+               /* Set HOST_RSP */
+               __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
+               "mov %%" _ASM_SP ", %c[host_rsp](%0)\n\t"
+
+               /* Check if vmlaunch or vmresume is needed */
+               "cmpl $0, %c[launched](%0)\n\t"
+               "je 1f\n\t"
+               __ex(ASM_VMX_VMRESUME) "\n\t"
+               "jmp 2f\n\t"
+               "1: " __ex(ASM_VMX_VMLAUNCH) "\n\t"
+               "jmp 2f\n\t"
+               "2: "
+
+               /* Set vmx->fail accordingly */
+               "setbe %c[fail](%0)\n\t"
+
+               ".pushsection .rodata\n\t"
+               ".global vmx_early_consistency_check_return\n\t"
+               "vmx_early_consistency_check_return: " _ASM_PTR " 2b\n\t"
+               ".popsection"
+             :
+             : "c"(vmx), "d"((unsigned long)HOST_RSP),
+               [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
+               [fail]"i"(offsetof(struct vcpu_vmx, fail)),
+               [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp))
+             : "rax", "cc", "memory"
+       );
+
+       vmcs_writel(HOST_RIP, vmx_return);
+
+       preempt_enable();
+
+       if (vmx->msr_autoload.host.nr)
+               vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+       if (vmx->msr_autoload.guest.nr)
+               vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
+
+       if (vmx->fail) {
+               WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
+                            VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+               vmx->fail = 0;
+               return 1;
+       }
+
+       /*
+        * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
+        */
+       local_irq_enable();
+       if (hw_breakpoint_active())
+               set_debugreg(__this_cpu_read(cpu_dr7), 7);
+
+       /*
+        * A non-failing VMEntry means we somehow entered guest mode with
+        * an illegal RIP, and that's just the tip of the iceberg.  There
+        * is no telling what memory has been modified or what state has
+        * been exposed to unknown code.  Hitting this all but guarantees
+        * a (very critical) hardware issue.
+        */
+       WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
+               VMX_EXIT_REASONS_FAILED_VMENTRY));
+
+       return 0;
+}
+STACK_FRAME_NON_STANDARD(nested_vmx_check_vmentry_hw);
+
 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
                                   struct vmcs12 *vmcs12);
 
 /*
  * If from_vmentry is false, this is being called from state restore (either RSM
  * or KVM_SET_NESTED_STATE).  Otherwise it's called from vmlaunch/vmresume.
+ *
+ * Returns:
+ *   0 - success, i.e. proceed with actual VMEnter
+ *   1 - consistency check VMExit
+ *  -1 - consistency check VMFail
  */
 static int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
                                          bool from_vmentry)
@@ -12714,6 +12805,11 @@ static int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
        if (from_vmentry) {
                nested_get_vmcs12_pages(vcpu);
 
+               if (nested_vmx_check_vmentry_hw(vcpu)) {
+                       vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+                       return -1;
+               }
+
                if (check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
                        goto vmentry_fail_vmexit;
        }
@@ -12787,7 +12883,6 @@ vmentry_fail_vmexit:
        load_vmcs12_host_state(vcpu, vmcs12);
        vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
        vmcs12->exit_qualification = exit_qual;
-       nested_vmx_succeed(vcpu);
        if (enable_shadow_vmcs)
                vmx->nested.sync_shadow_vmcs = true;
        return 1;
@@ -12807,8 +12902,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
        if (!nested_vmx_check_permission(vcpu))
                return 1;
 
-       if (!nested_vmx_check_vmcs12(vcpu))
-               goto out;
+       if (vmx->nested.current_vmptr == -1ull)
+               return nested_vmx_failInvalid(vcpu);
 
        vmcs12 = get_vmcs12(vcpu);
 
@@ -12818,10 +12913,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
         * rather than RFLAGS.ZF, and no error number is stored to the
         * VM-instruction error field.
         */
-       if (vmcs12->hdr.shadow_vmcs) {
-               nested_vmx_failInvalid(vcpu);
-               goto out;
-       }
+       if (vmcs12->hdr.shadow_vmcs)
+               return nested_vmx_failInvalid(vcpu);
 
        if (enable_shadow_vmcs)
                copy_shadow_to_vmcs12(vmx);
@@ -12836,36 +12929,31 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
         * for misconfigurations which will anyway be caught by the processor
         * when using the merged vmcs02.
         */
-       if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) {
-               nested_vmx_failValid(vcpu,
-                                    VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
-               goto out;
-       }
+       if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
 
-       if (vmcs12->launch_state == launch) {
-               nested_vmx_failValid(vcpu,
+       if (vmcs12->launch_state == launch)
+               return nested_vmx_failValid(vcpu,
                        launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
                               : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
-               goto out;
-       }
 
        ret = check_vmentry_prereqs(vcpu, vmcs12);
-       if (ret) {
-               nested_vmx_failValid(vcpu, ret);
-               goto out;
-       }
+       if (ret)
+               return nested_vmx_failValid(vcpu, ret);
 
        /*
         * We're finally done with prerequisite checking, and can start with
         * the nested entry.
         */
-
        vmx->nested.nested_run_pending = 1;
        ret = nested_vmx_enter_non_root_mode(vcpu, true);
-       if (ret) {
-               vmx->nested.nested_run_pending = 0;
+       vmx->nested.nested_run_pending = !ret;
+       if (ret > 0)
                return 1;
-       }
+       else if (ret)
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_ENTRY_INVALID_CONTROL_FIELD);
 
        /* Hide L1D cache contents from the nested guest.  */
        vmx->vcpu.arch.l1tf_flush_l1d = true;
@@ -12892,9 +12980,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
                return kvm_vcpu_halt(vcpu);
        }
        return 1;
-
-out:
-       return kvm_skip_emulated_instruction(vcpu);
 }
 
 /*
@@ -13507,14 +13592,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
        /* trying to cancel vmlaunch/vmresume is a bug */
        WARN_ON_ONCE(vmx->nested.nested_run_pending);
 
-       /*
-        * The only expected VM-instruction error is "VM entry with
-        * invalid control field(s)." Anything else indicates a
-        * problem with L0.
-        */
-       WARN_ON_ONCE(vmx->fail && (vmcs_read32(VM_INSTRUCTION_ERROR) !=
-                                  VMXERR_ENTRY_INVALID_CONTROL_FIELD));
-
        leave_guest_mode(vcpu);
 
        if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
@@ -13541,6 +13618,16 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
                if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
                                         vmcs12->vm_exit_msr_store_count))
                        nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
+       } else {
+               /*
+                * The only expected VM-instruction error is "VM entry with
+                * invalid control field(s)." Anything else indicates a
+                * problem with L0.  And we should never get here with a
+                * VMFail of any type if early consistency checks are enabled.
+                */
+               WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
+                            VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+               WARN_ON_ONCE(nested_early_check);
        }
 
        vmx_switch_vmcs(vcpu, &vmx->vmcs01);
@@ -13623,7 +13710,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 
                return;
        }
-       
+
        /*
         * After an early L2 VM-entry failure, we're now back
         * in L1 which thinks it just finished a VMLAUNCH or
@@ -13631,9 +13718,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
         * flag and the VM-instruction error field of the VMCS
         * accordingly, and skip the emulated instruction.
         */
-       nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
-
-       kvm_skip_emulated_instruction(vcpu);
+       (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
 
        /*
         * Restore L1's host state to KVM's software model.  We're here
@@ -13655,7 +13740,7 @@ static void vmx_leave_nested(struct kvm_vcpu *vcpu)
                to_vmx(vcpu)->nested.nested_run_pending = 0;
                nested_vmx_vmexit(vcpu, -1, 0, 0);
        }
-       free_nested(to_vmx(vcpu));
+       free_nested(vcpu);
 }
 
 static int vmx_check_intercept(struct kvm_vcpu *vcpu,