KVM/x86: Fix invvpid and invept register operand size in 64-bit mode
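
In 64-bit mode the register operand of INVVPID and INVEPT is always a
64-bit register, so the extent-type argument of the __invvpid()/__invept()
wrappers is widened from int to unsigned long to match the operand size
the CPU actually uses.

The fragment below is an illustrative sketch only, not code from this
patch: the 128-bit descriptor layout mirrors the one visible in the diff,
but the asm constraints, the absence of the kernel's exception-handling
wrappers, and the invvpid_sketch() name are simplifications added here for
illustration.  INVVPID is a privileged VMX instruction, so this compiles
but can only execute in VMX root operation; INVEPT is analogous, with a
128-bit {eptp, gpa} descriptor.

    #include <stdint.h>

    static inline void invvpid_sketch(unsigned long ext, uint16_t vpid,
                                      uint64_t gva)
    {
            /* 128-bit in-memory descriptor: VPID, reserved bits, address. */
            struct {
                    uint64_t vpid : 16;
                    uint64_t rsvd : 48;
                    uint64_t gva;
            } operand = { vpid, 0, gva };

            /*
             * AT&T operand order: memory descriptor first, then the register
             * selecting the invalidation type.  With "r" and an unsigned
             * long, the type travels in a full 64-bit GPR, matching the
             * register operand size of INVVPID in 64-bit mode.
             */
            asm volatile("invvpid %0, %1"
                         : : "m"(operand), "r"(ext) : "cc", "memory");
    }
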
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index cdc4367..05a0e8f 100644
@@ -110,6 +110,9 @@ module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
 static bool __read_mostly nested = 0;
 module_param(nested, bool, S_IRUGO);
 
+static bool __read_mostly nested_early_check = 0;
+module_param(nested_early_check, bool, S_IRUGO);
+
 static u64 __read_mostly host_xss;
 
 static bool __read_mostly enable_pml = 1;
@@ -187,6 +190,7 @@ static unsigned int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
 module_param(ple_window_max, uint, 0444);
 
 extern const ulong vmx_return;
+extern const ulong vmx_early_consistency_check_return;
 
 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
@@ -2066,7 +2070,7 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
        return -1;
 }
 
-static inline void __invvpid(int ext, u16 vpid, gva_t gva)
+static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva)
 {
     struct {
        u64 vpid : 16;
@@ -2081,7 +2085,7 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva)
     BUG_ON(error);
 }
 
-static inline void __invept(int ext, u64 eptp, gpa_t gpa)
+static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa)
 {
        struct {
                u64 eptp, gpa;
@@ -2696,7 +2700,8 @@ static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
                u64 guest_val, u64 host_val)
 {
        vmcs_write64(guest_val_vmcs, guest_val);
-       vmcs_write64(host_val_vmcs, host_val);
+       if (host_val_vmcs != HOST_IA32_EFER)
+               vmcs_write64(host_val_vmcs, host_val);
        vm_entry_controls_setbit(vmx, entry);
        vm_exit_controls_setbit(vmx, exit);
 }
@@ -5106,9 +5111,10 @@ static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid,
                                bool invalidate_gpa)
 {
        if (enable_ept && (invalidate_gpa || !enable_vpid)) {
-               if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+               if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
                        return;
-               ept_sync_context(construct_eptp(vcpu, vcpu->arch.mmu.root_hpa));
+               ept_sync_context(construct_eptp(vcpu,
+                                               vcpu->arch.mmu->root_hpa));
        } else {
                vpid_sync_context(vpid);
        }
@@ -6333,6 +6339,9 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
                rdmsr(MSR_IA32_CR_PAT, low32, high32);
                vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
        }
+
+       if (cpu_has_load_ia32_efer)
+               vmcs_write64(HOST_IA32_EFER, host_efer);
 }
 
 static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
@@ -8060,35 +8069,37 @@ static int handle_monitor(struct kvm_vcpu *vcpu)
 
 /*
  * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
- * set the success or error code of an emulated VMX instruction, as specified
- * by Vol 2B, VMX Instruction Reference, "Conventions".
+ * set the success or error code of an emulated VMX instruction (as specified
+ * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
+ * instruction.
  */
-static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
+static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
 {
        vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
                        & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
                            X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
+       return kvm_skip_emulated_instruction(vcpu);
 }
 
-static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
+static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
 {
        vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
                        & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
                            X86_EFLAGS_SF | X86_EFLAGS_OF))
                        | X86_EFLAGS_CF);
+       return kvm_skip_emulated_instruction(vcpu);
 }
 
-static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
-                                       u32 vm_instruction_error)
+static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
+                               u32 vm_instruction_error)
 {
-       if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
-               /*
-                * failValid writes the error number to the current VMCS, which
-                * can't be done there isn't a current VMCS.
-                */
-               nested_vmx_failInvalid(vcpu);
-               return;
-       }
+       /*
+        * failValid writes the error number to the current VMCS, which
+        * can't be done if there isn't a current VMCS.
+        */
+       if (to_vmx(vcpu)->nested.current_vmptr == -1ull)
+               return nested_vmx_failInvalid(vcpu);
+
        vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
                        & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
                            X86_EFLAGS_SF | X86_EFLAGS_OF))
@@ -8098,6 +8109,7 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
         * We don't need to force a shadow sync because
         * VM_INSTRUCTION_ERROR is not shadowed
         */
+       return kvm_skip_emulated_instruction(vcpu);
 }
 
 static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
@@ -8339,10 +8351,9 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
                return 1;
        }
 
-       if (vmx->nested.vmxon) {
-               nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (vmx->nested.vmxon)
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
 
        if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
                        != VMXON_NEEDED_FEATURES) {
@@ -8361,21 +8372,17 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
         * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
         * which replaces physical address width with 32
         */
-       if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
-               nested_vmx_failInvalid(vcpu);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
+               return nested_vmx_failInvalid(vcpu);
 
        page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
-       if (is_error_page(page)) {
-               nested_vmx_failInvalid(vcpu);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (is_error_page(page))
+               return nested_vmx_failInvalid(vcpu);
+
        if (*(u32 *)kmap(page) != VMCS12_REVISION) {
                kunmap(page);
                kvm_release_page_clean(page);
-               nested_vmx_failInvalid(vcpu);
-               return kvm_skip_emulated_instruction(vcpu);
+               return nested_vmx_failInvalid(vcpu);
        }
        kunmap(page);
        kvm_release_page_clean(page);
@@ -8385,8 +8392,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
        if (ret)
                return ret;
 
-       nested_vmx_succeed(vcpu);
-       return kvm_skip_emulated_instruction(vcpu);
+       return nested_vmx_succeed(vcpu);
 }
 
 /*
@@ -8417,8 +8423,10 @@ static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
        vmcs_write64(VMCS_LINK_POINTER, -1ull);
 }
 
-static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
+static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
        if (vmx->nested.current_vmptr == -1ull)
                return;
 
@@ -8432,10 +8440,12 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
        vmx->nested.posted_intr_nv = -1;
 
        /* Flush VMCS12 to guest memory */
-       kvm_vcpu_write_guest_page(&vmx->vcpu,
+       kvm_vcpu_write_guest_page(vcpu,
                                  vmx->nested.current_vmptr >> PAGE_SHIFT,
                                  vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
 
+       kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+
        vmx->nested.current_vmptr = -1ull;
 }
 
@@ -8443,8 +8453,10 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
  * Free whatever needs to be freed from vmx->nested when L1 goes down, or
  * just stops using VMX.
  */
-static void free_nested(struct vcpu_vmx *vmx)
+static void free_nested(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
        if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
                return;
 
@@ -8477,6 +8489,8 @@ static void free_nested(struct vcpu_vmx *vmx)
                vmx->nested.pi_desc = NULL;
        }
 
+       kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+
        free_loaded_vmcs(&vmx->nested.vmcs02);
 }
 
@@ -8485,9 +8499,8 @@ static int handle_vmoff(struct kvm_vcpu *vcpu)
 {
        if (!nested_vmx_check_permission(vcpu))
                return 1;
-       free_nested(to_vmx(vcpu));
-       nested_vmx_succeed(vcpu);
-       return kvm_skip_emulated_instruction(vcpu);
+       free_nested(vcpu);
+       return nested_vmx_succeed(vcpu);
 }
 
 /* Emulate the VMCLEAR instruction */
@@ -8503,25 +8516,22 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
        if (nested_vmx_get_vmptr(vcpu, &vmptr))
                return 1;
 
-       if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
-               nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_VMCLEAR_INVALID_ADDRESS);
 
-       if (vmptr == vmx->nested.vmxon_ptr) {
-               nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (vmptr == vmx->nested.vmxon_ptr)
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_VMCLEAR_VMXON_POINTER);
 
        if (vmptr == vmx->nested.current_vmptr)
-               nested_release_vmcs12(vmx);
+               nested_release_vmcs12(vcpu);
 
        kvm_vcpu_write_guest(vcpu,
                        vmptr + offsetof(struct vmcs12, launch_state),
                        &zero, sizeof(zero));
 
-       nested_vmx_succeed(vcpu);
-       return kvm_skip_emulated_instruction(vcpu);
+       return nested_vmx_succeed(vcpu);
 }
 
 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
@@ -8677,20 +8687,6 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
        vmcs_load(vmx->loaded_vmcs->vmcs);
 }
 
-/*
- * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was
- * used before) all generate the same failure when it is missing.
- */
-static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-       if (vmx->nested.current_vmptr == -1ull) {
-               nested_vmx_failInvalid(vcpu);
-               return 0;
-       }
-       return 1;
-}
-
 static int handle_vmread(struct kvm_vcpu *vcpu)
 {
        unsigned long field;
@@ -8703,8 +8699,8 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
        if (!nested_vmx_check_permission(vcpu))
                return 1;
 
-       if (!nested_vmx_check_vmcs12(vcpu))
-               return kvm_skip_emulated_instruction(vcpu);
+       if (to_vmx(vcpu)->nested.current_vmptr == -1ull)
+               return nested_vmx_failInvalid(vcpu);
 
        if (!is_guest_mode(vcpu))
                vmcs12 = get_vmcs12(vcpu);
@@ -8713,20 +8709,18 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
                 * When vmcs->vmcs_link_pointer is -1ull, any VMREAD
                 * to shadowed-field sets the ALU flags for VMfailInvalid.
                 */
-               if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) {
-                       nested_vmx_failInvalid(vcpu);
-                       return kvm_skip_emulated_instruction(vcpu);
-               }
+               if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
+                       return nested_vmx_failInvalid(vcpu);
                vmcs12 = get_shadow_vmcs12(vcpu);
        }
 
        /* Decode instruction info and find the field to read */
        field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
        /* Read the field, zero-extended to a u64 field_value */
-       if (vmcs12_read_any(vmcs12, field, &field_value) < 0) {
-               nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (vmcs12_read_any(vmcs12, field, &field_value) < 0)
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+
        /*
         * Now copy part of this value to register or memory, as requested.
         * Note that the number of bits actually copied is 32 or 64 depending
@@ -8744,8 +8738,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
                                            (is_long_mode(vcpu) ? 8 : 4), NULL);
        }
 
-       nested_vmx_succeed(vcpu);
-       return kvm_skip_emulated_instruction(vcpu);
+       return nested_vmx_succeed(vcpu);
 }
 
 
@@ -8770,8 +8763,8 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
        if (!nested_vmx_check_permission(vcpu))
                return 1;
 
-       if (!nested_vmx_check_vmcs12(vcpu))
-               return kvm_skip_emulated_instruction(vcpu);
+       if (vmx->nested.current_vmptr == -1ull)
+               return nested_vmx_failInvalid(vcpu);
 
        if (vmx_instruction_info & (1u << 10))
                field_value = kvm_register_readl(vcpu,
@@ -8794,11 +8787,9 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
         * VMCS," then the "read-only" fields are actually read/write.
         */
        if (vmcs_field_readonly(field) &&
-           !nested_cpu_has_vmwrite_any_field(vcpu)) {
-               nested_vmx_failValid(vcpu,
+           !nested_cpu_has_vmwrite_any_field(vcpu))
+               return nested_vmx_failValid(vcpu,
                        VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
 
        if (!is_guest_mode(vcpu))
                vmcs12 = get_vmcs12(vcpu);
@@ -8807,18 +8798,14 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
                 * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE
                 * to shadowed-field sets the ALU flags for VMfailInvalid.
                 */
-               if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) {
-                       nested_vmx_failInvalid(vcpu);
-                       return kvm_skip_emulated_instruction(vcpu);
-               }
+               if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
+                       return nested_vmx_failInvalid(vcpu);
                vmcs12 = get_shadow_vmcs12(vcpu);
-
        }
 
-       if (vmcs12_write_any(vmcs12, field, field_value) < 0) {
-               nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (vmcs12_write_any(vmcs12, field, field_value) < 0)
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_UNSUPPORTED_VMCS_COMPONENT);
 
        /*
         * Do not track vmcs12 dirty-state if in guest-mode
@@ -8840,8 +8827,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
                }
        }
 
-       nested_vmx_succeed(vcpu);
-       return kvm_skip_emulated_instruction(vcpu);
+       return nested_vmx_succeed(vcpu);
 }
 
 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
@@ -8869,36 +8855,33 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
        if (nested_vmx_get_vmptr(vcpu, &vmptr))
                return 1;
 
-       if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) {
-               nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu)))
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_VMPTRLD_INVALID_ADDRESS);
 
-       if (vmptr == vmx->nested.vmxon_ptr) {
-               nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
+       if (vmptr == vmx->nested.vmxon_ptr)
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_VMPTRLD_VMXON_POINTER);
 
        if (vmx->nested.current_vmptr != vmptr) {
                struct vmcs12 *new_vmcs12;
                struct page *page;
                page = kvm_vcpu_gpa_to_page(vcpu, vmptr);
-               if (is_error_page(page)) {
-                       nested_vmx_failInvalid(vcpu);
-                       return kvm_skip_emulated_instruction(vcpu);
-               }
+               if (is_error_page(page))
+                       return nested_vmx_failInvalid(vcpu);
+
                new_vmcs12 = kmap(page);
                if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
                    (new_vmcs12->hdr.shadow_vmcs &&
                     !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
                        kunmap(page);
                        kvm_release_page_clean(page);
-                       nested_vmx_failValid(vcpu,
+                       return nested_vmx_failValid(vcpu,
                                VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
-                       return kvm_skip_emulated_instruction(vcpu);
                }
 
-               nested_release_vmcs12(vmx);
+               nested_release_vmcs12(vcpu);
+
                /*
                 * Load VMCS12 from guest memory since it is not already
                 * cached.
@@ -8910,8 +8893,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
                set_current_vmptr(vmx, vmptr);
        }
 
-       nested_vmx_succeed(vcpu);
-       return kvm_skip_emulated_instruction(vcpu);
+       return nested_vmx_succeed(vcpu);
 }
 
 /* Emulate the VMPTRST instruction */
@@ -8934,8 +8916,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
                kvm_inject_page_fault(vcpu, &e);
                return 1;
        }
-       nested_vmx_succeed(vcpu);
-       return kvm_skip_emulated_instruction(vcpu);
+       return nested_vmx_succeed(vcpu);
 }
 
 /* Emulate the INVEPT instruction */
@@ -8965,11 +8946,9 @@ static int handle_invept(struct kvm_vcpu *vcpu)
 
        types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
 
-       if (type >= 32 || !(types & (1 << type))) {
-               nested_vmx_failValid(vcpu,
+       if (type >= 32 || !(types & (1 << type)))
+               return nested_vmx_failValid(vcpu,
                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
 
        /* According to the Intel VMX instruction reference, the memory
         * operand is read even if it isn't needed (e.g., for type==global)
@@ -8991,14 +8970,13 @@ static int handle_invept(struct kvm_vcpu *vcpu)
        case VMX_EPT_EXTENT_CONTEXT:
                kvm_mmu_sync_roots(vcpu);
                kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
-               nested_vmx_succeed(vcpu);
                break;
        default:
                BUG_ON(1);
                break;
        }
 
-       return kvm_skip_emulated_instruction(vcpu);
+       return nested_vmx_succeed(vcpu);
 }
 
 static u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
@@ -9037,11 +9015,9 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
        types = (vmx->nested.msrs.vpid_caps &
                        VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
 
-       if (type >= 32 || !(types & (1 << type))) {
-               nested_vmx_failValid(vcpu,
+       if (type >= 32 || !(types & (1 << type)))
+               return nested_vmx_failValid(vcpu,
                        VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
 
        /* according to the intel vmx instruction reference, the memory
         * operand is read even if it isn't needed (e.g., for type==global)
@@ -9053,21 +9029,17 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
                kvm_inject_page_fault(vcpu, &e);
                return 1;
        }
-       if (operand.vpid >> 16) {
-               nested_vmx_failValid(vcpu,
+       if (operand.vpid >> 16)
+               return nested_vmx_failValid(vcpu,
                        VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
-               return kvm_skip_emulated_instruction(vcpu);
-       }
 
        vpid02 = nested_get_vpid02(vcpu);
        switch (type) {
        case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
                if (!operand.vpid ||
-                   is_noncanonical_address(operand.gla, vcpu)) {
-                       nested_vmx_failValid(vcpu,
+                   is_noncanonical_address(operand.gla, vcpu))
+                       return nested_vmx_failValid(vcpu,
                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
-                       return kvm_skip_emulated_instruction(vcpu);
-               }
                if (cpu_has_vmx_invvpid_individual_addr()) {
                        __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR,
                                vpid02, operand.gla);
@@ -9076,11 +9048,9 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
                break;
        case VMX_VPID_EXTENT_SINGLE_CONTEXT:
        case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
-               if (!operand.vpid) {
-                       nested_vmx_failValid(vcpu,
+               if (!operand.vpid)
+                       return nested_vmx_failValid(vcpu,
                                VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
-                       return kvm_skip_emulated_instruction(vcpu);
-               }
                __vmx_flush_tlb(vcpu, vpid02, false);
                break;
        case VMX_VPID_EXTENT_ALL_CONTEXT:
@@ -9091,9 +9061,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
                return kvm_skip_emulated_instruction(vcpu);
        }
 
-       nested_vmx_succeed(vcpu);
-
-       return kvm_skip_emulated_instruction(vcpu);
+       return nested_vmx_succeed(vcpu);
 }
 
 static int handle_invpcid(struct kvm_vcpu *vcpu)
@@ -9164,11 +9132,11 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
                }
 
                for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
-                       if (kvm_get_pcid(vcpu, vcpu->arch.mmu.prev_roots[i].cr3)
+                       if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3)
                            == operand.pcid)
                                roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
 
-               kvm_mmu_free_roots(vcpu, roots_to_free);
+               kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
                /*
                 * If neither the current cr3 nor any of the prev_roots use the
                 * given PCID, then nothing needs to be done here because a
@@ -9295,7 +9263,7 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
 
                kvm_mmu_unload(vcpu);
                mmu->ept_ad = accessed_dirty;
-               mmu->base_role.ad_disabled = !accessed_dirty;
+               mmu->mmu_role.base.ad_disabled = !accessed_dirty;
                vmcs12->ept_pointer = address;
                /*
                 * TODO: Check what's the correct approach in case
@@ -10969,12 +10937,10 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
  */
 static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu)
 {
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-       vcpu_load(vcpu);
-       vmx_switch_vmcs(vcpu, &vmx->vmcs01);
-       free_nested(vmx);
-       vcpu_put(vcpu);
+       vcpu_load(vcpu);
+       vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
+       free_nested(vcpu);
+       vcpu_put(vcpu);
 }
 
 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
@@ -11341,21 +11307,24 @@ static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
 {
        WARN_ON(mmu_is_nested(vcpu));
 
+       vcpu->arch.mmu = &vcpu->arch.guest_mmu;
        kvm_init_shadow_ept_mmu(vcpu,
                        to_vmx(vcpu)->nested.msrs.ept_caps &
                        VMX_EPT_EXECUTE_ONLY_BIT,
                        nested_ept_ad_enabled(vcpu),
                        nested_ept_get_cr3(vcpu));
-       vcpu->arch.mmu.set_cr3           = vmx_set_cr3;
-       vcpu->arch.mmu.get_cr3           = nested_ept_get_cr3;
-       vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
+       vcpu->arch.mmu->set_cr3           = vmx_set_cr3;
+       vcpu->arch.mmu->get_cr3           = nested_ept_get_cr3;
+       vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
+       vcpu->arch.mmu->get_pdptr         = kvm_pdptr_read;
 
        vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
 }
 
 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
 {
-       vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+       vcpu->arch.mmu = &vcpu->arch.root_mmu;
+       vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
 }
 
 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
@@ -11999,6 +11968,14 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
                return;
        vmx->nested.vmcs02_initialized = true;
 
+       /*
+        * We don't care what the EPTP value is; we just need to guarantee
+        * it's valid so we don't get a false positive when doing early
+        * consistency checks.
+        */
+       if (enable_ept && nested_early_check)
+               vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0));
+
        /* All VMFUNCs are currently emulated through L0 vmexits.  */
        if (cpu_has_vmx_vmfunc())
                vmcs_write64(VM_FUNCTION_CONTROL, 0);
@@ -12052,7 +12029,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
         * entry, but only if the current (host) sp changed from the value
         * we wrote last (vmx->host_rsp).  This cache is no longer relevant
         * if we switch vmcs, and rather than hold a separate cache per vmcs,
-        * here we just force the write to happen on entry.
+        * here we just force the write to happen on entry.  host_rsp will
+        * also be written unconditionally by nested_vmx_check_vmentry_hw()
+        * if we are doing early consistency checks via hardware.
         */
        vmx->host_rsp = 0;
 
@@ -12680,12 +12659,124 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        return 0;
 }
 
+static int __noclone nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       unsigned long cr3, cr4;
+
+       if (!nested_early_check)
+               return 0;
+
+       if (vmx->msr_autoload.host.nr)
+               vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
+       if (vmx->msr_autoload.guest.nr)
+               vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
+
+       preempt_disable();
+
+       vmx_prepare_switch_to_guest(vcpu);
+
+       /*
+        * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
+        * which is reserved to '1' by hardware.  GUEST_RFLAGS is guaranteed to
+        * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
+        * there is no need to preserve other bits or save/restore the field.
+        */
+       vmcs_writel(GUEST_RFLAGS, 0);
+
+       vmcs_writel(HOST_RIP, vmx_early_consistency_check_return);
+
+       cr3 = __get_current_cr3_fast();
+       if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
+               vmcs_writel(HOST_CR3, cr3);
+               vmx->loaded_vmcs->host_state.cr3 = cr3;
+       }
+
+       cr4 = cr4_read_shadow();
+       if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
+               vmcs_writel(HOST_CR4, cr4);
+               vmx->loaded_vmcs->host_state.cr4 = cr4;
+       }
+
+       vmx->__launched = vmx->loaded_vmcs->launched;
+
+       asm(
+               /* Set HOST_RSP */
+               __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
+               "mov %%" _ASM_SP ", %c[host_rsp](%0)\n\t"
+
+               /* Check if vmlaunch or vmresume is needed */
+               "cmpl $0, %c[launched](%0)\n\t"
+               "je 1f\n\t"
+               __ex(ASM_VMX_VMRESUME) "\n\t"
+               "jmp 2f\n\t"
+               "1: " __ex(ASM_VMX_VMLAUNCH) "\n\t"
+               "jmp 2f\n\t"
+               "2: "
+
+               /* Set vmx->fail accordingly */
+               "setbe %c[fail](%0)\n\t"
+
+               ".pushsection .rodata\n\t"
+               ".global vmx_early_consistency_check_return\n\t"
+               "vmx_early_consistency_check_return: " _ASM_PTR " 2b\n\t"
+               ".popsection"
+             :
+             : "c"(vmx), "d"((unsigned long)HOST_RSP),
+               [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
+               [fail]"i"(offsetof(struct vcpu_vmx, fail)),
+               [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp))
+             : "rax", "cc", "memory"
+       );
+
+       vmcs_writel(HOST_RIP, vmx_return);
+
+       preempt_enable();
+
+       if (vmx->msr_autoload.host.nr)
+               vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
+       if (vmx->msr_autoload.guest.nr)
+               vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
+
+       if (vmx->fail) {
+               WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
+                            VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+               vmx->fail = 0;
+               return 1;
+       }
+
+       /*
+        * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
+        */
+       local_irq_enable();
+       if (hw_breakpoint_active())
+               set_debugreg(__this_cpu_read(cpu_dr7), 7);
+
+       /*
+        * A non-failing VMEntry means we somehow entered guest mode with
+        * an illegal RIP, and that's just the tip of the iceberg.  There
+        * is no telling what memory has been modified or what state has
+        * been exposed to unknown code.  Hitting this all but guarantees
+        * a (very critical) hardware issue.
+        */
+       WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
+               VMX_EXIT_REASONS_FAILED_VMENTRY));
+
+       return 0;
+}
+STACK_FRAME_NON_STANDARD(nested_vmx_check_vmentry_hw);
+
 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
                                   struct vmcs12 *vmcs12);
 
 /*
  * If from_vmentry is false, this is being called from state restore (either RSM
  * or KVM_SET_NESTED_STATE).  Otherwise it's called from vmlaunch/vmresume.
+ *
+ * Returns:
+ *   0 - success, i.e. proceed with actual VMEnter
+ *   1 - consistency check VMExit
+ *  -1 - consistency check VMFail
  */
 static int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
                                          bool from_vmentry)
@@ -12714,6 +12805,11 @@ static int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
        if (from_vmentry) {
                nested_get_vmcs12_pages(vcpu);
 
+               if (nested_vmx_check_vmentry_hw(vcpu)) {
+                       vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+                       return -1;
+               }
+
                if (check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
                        goto vmentry_fail_vmexit;
        }
@@ -12787,7 +12883,6 @@ vmentry_fail_vmexit:
        load_vmcs12_host_state(vcpu, vmcs12);
        vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
        vmcs12->exit_qualification = exit_qual;
-       nested_vmx_succeed(vcpu);
        if (enable_shadow_vmcs)
                vmx->nested.sync_shadow_vmcs = true;
        return 1;
@@ -12807,8 +12902,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
        if (!nested_vmx_check_permission(vcpu))
                return 1;
 
-       if (!nested_vmx_check_vmcs12(vcpu))
-               goto out;
+       if (vmx->nested.current_vmptr == -1ull)
+               return nested_vmx_failInvalid(vcpu);
 
        vmcs12 = get_vmcs12(vcpu);
 
@@ -12818,10 +12913,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
         * rather than RFLAGS.ZF, and no error number is stored to the
         * VM-instruction error field.
         */
-       if (vmcs12->hdr.shadow_vmcs) {
-               nested_vmx_failInvalid(vcpu);
-               goto out;
-       }
+       if (vmcs12->hdr.shadow_vmcs)
+               return nested_vmx_failInvalid(vcpu);
 
        if (enable_shadow_vmcs)
                copy_shadow_to_vmcs12(vmx);
@@ -12836,36 +12929,31 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
         * for misconfigurations which will anyway be caught by the processor
         * when using the merged vmcs02.
         */
-       if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) {
-               nested_vmx_failValid(vcpu,
-                                    VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
-               goto out;
-       }
+       if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
 
-       if (vmcs12->launch_state == launch) {
-               nested_vmx_failValid(vcpu,
+       if (vmcs12->launch_state == launch)
+               return nested_vmx_failValid(vcpu,
                        launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
                               : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
-               goto out;
-       }
 
        ret = check_vmentry_prereqs(vcpu, vmcs12);
-       if (ret) {
-               nested_vmx_failValid(vcpu, ret);
-               goto out;
-       }
+       if (ret)
+               return nested_vmx_failValid(vcpu, ret);
 
        /*
         * We're finally done with prerequisite checking, and can start with
         * the nested entry.
         */
-
        vmx->nested.nested_run_pending = 1;
        ret = nested_vmx_enter_non_root_mode(vcpu, true);
-       if (ret) {
-               vmx->nested.nested_run_pending = 0;
+       vmx->nested.nested_run_pending = !ret;
+       if (ret > 0)
                return 1;
-       }
+       else if (ret)
+               return nested_vmx_failValid(vcpu,
+                       VMXERR_ENTRY_INVALID_CONTROL_FIELD);
 
        /* Hide L1D cache contents from the nested guest.  */
        vmx->vcpu.arch.l1tf_flush_l1d = true;
@@ -12892,9 +12980,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
                return kvm_vcpu_halt(vcpu);
        }
        return 1;
-
-out:
-       return kvm_skip_emulated_instruction(vcpu);
 }
 
 /*
@@ -13507,14 +13592,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
        /* trying to cancel vmlaunch/vmresume is a bug */
        WARN_ON_ONCE(vmx->nested.nested_run_pending);
 
-       /*
-        * The only expected VM-instruction error is "VM entry with
-        * invalid control field(s)." Anything else indicates a
-        * problem with L0.
-        */
-       WARN_ON_ONCE(vmx->fail && (vmcs_read32(VM_INSTRUCTION_ERROR) !=
-                                  VMXERR_ENTRY_INVALID_CONTROL_FIELD));
-
        leave_guest_mode(vcpu);
 
        if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
@@ -13541,6 +13618,16 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
                if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
                                         vmcs12->vm_exit_msr_store_count))
                        nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
+       } else {
+               /*
+                * The only expected VM-instruction error is "VM entry with
+                * invalid control field(s)." Anything else indicates a
+                * problem with L0.  And we should never get here with a
+                * VMFail of any type if early consistency checks are enabled.
+                */
+               WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
+                            VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+               WARN_ON_ONCE(nested_early_check);
        }
 
        vmx_switch_vmcs(vcpu, &vmx->vmcs01);
@@ -13623,7 +13710,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 
                return;
        }
-       
+
        /*
         * After an early L2 VM-entry failure, we're now back
         * in L1 which thinks it just finished a VMLAUNCH or
@@ -13631,9 +13718,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
         * flag and the VM-instruction error field of the VMCS
         * accordingly, and skip the emulated instruction.
         */
-       nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
-
-       kvm_skip_emulated_instruction(vcpu);
+       (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
 
        /*
         * Restore L1's host state to KVM's software model.  We're here
@@ -13655,7 +13740,7 @@ static void vmx_leave_nested(struct kvm_vcpu *vcpu)
                to_vmx(vcpu)->nested.nested_run_pending = 0;
                nested_vmx_vmexit(vcpu, -1, 0, 0);
        }
-       free_nested(to_vmx(vcpu));
+       free_nested(vcpu);
 }
 
 static int vmx_check_intercept(struct kvm_vcpu *vcpu,